def _activation(self, Y, L, M, W):
    r"""Return the normalised top-layer activation for a batch of inputs.

    Derived from the generative model formulation of hierarchical
    Poisson mixtures, the formula for the activation in the network
    reads as follows:

        I_c = \sum_d \log(W_{cd}) y_d + \log(M_{lc})         for labeled data
              \sum_d \log(W_{cd}) y_d + \log(\sum_k M_{kc})  for unlabeled data
        s_c = softmax(I_c)

    The docstring is a raw string so the LaTeX backslashes are not
    parsed as (invalid) string escape sequences.

    Parameters
    ----------
    Y : tensor
        Inputs; axis 1 is contracted against axis 1 of ``log(W)``.
    L : integer tensor
        Per-sample label used to select a row of ``M``; ``-1`` marks an
        unlabeled sample.
    M : tensor
        Label weights, indexed by label along axis 0.
    W : tensor
        Input weights; ``log(W)`` is taken element-wise.

    Returns
    -------
    tensor
        ``s``: the label-weighted exponentials, normalised along axis 1.
    """
    # Input integration: I[n, c] = sum_d Y[n, d] * log(W[c, d])
    I = T.tensordot(Y, T.log(W), axes=[1, 1])

    # Recurrent (label) term: the row of M for each sample's label.
    # For unlabeled samples (L == -1) use the sum over all labels instead.
    vM = M[L]
    L_index = T.eq(L, -1).nonzero()
    vM = T.set_subtensor(vM[L_index], T.sum(M, axis=0))

    # Head-room that depends on the number of columns of I; shared by the
    # overflow and the underflow guard below.
    log_width = T.ceil(T.log(I.shape[1].astype('float32')))

    # Numeric trick to prevent overflow in the exp-function: shift every
    # row whose maximum exceeds ``max_exponent`` down to that bound.
    # (86/87 are presumably chosen for float32 exp range -- TODO confirm.)
    max_exponent = 86. - log_width
    row_max = T.max(I, axis=1, keepdims=True)
    scale = T.switch(T.gt(row_max, max_exponent),
                     row_max - max_exponent,
                     0.)

    # Numeric approximation to prevent underflow in the exp-function:
    # map too low values of I to a fixed minimum value.
    min_exponent = -87. + log_width
    I = T.switch(T.lt(I - scale, min_exponent), scale + min_exponent, I)

    # Activation: label-weighted softmax with overflow protection. The
    # common factor exp(-scale) cancels in the ratio.
    weighted = vM * T.exp(I - scale)
    s = weighted / T.sum(weighted, axis=1, keepdims=True)
    return s
def _activation(self, Y, L, M, W):
    r"""Compute the softmax-style activation for a batch of inputs.

    Derived from the generative model formulation of hierarchical
    Poisson mixtures, the formula for the activation in the network
    reads as follows:

        I_c = \sum_d \log(W_{cd}) y_d + \log(M_{lc})         for labeled data
              \sum_d \log(W_{cd}) y_d + \log(\sum_k M_{kc})  for unlabeled data
        s_c = softmax(I_c)

    A raw docstring is used because the formulas contain backslashes
    that are not valid Python escape sequences.

    Parameters
    ----------
    Y : tensor
        Inputs; contracted with ``log(W)`` over axis 1 of both operands.
    L : integer tensor
        Label per sample, used as a row index into ``M``. The value
        ``-1`` is treated as "no label".
    M : tensor
        Label weights (first axis indexed by label).
    W : tensor
        Input weights, used through ``log(W)``.

    Returns
    -------
    tensor
        The activation ``s``; every row is normalised to sum to one.
    """
    # first: complete inference to find label
    # Input integration:
    I = T.tensordot(Y, T.log(W), axes=[1, 1])

    # recurrent term: M row of the given label, or the column sums of M
    # where the label is missing (-1)
    vM = M[L]
    unlabeled = T.eq(L, -1).nonzero()
    vM = T.set_subtensor(vM[unlabeled], T.sum(M, axis=0))

    # ceil(log(number of columns)); used as safety margin in both bounds
    margin = T.ceil(T.log(I.shape[1].astype('float32')))
    peak = T.max(I, axis=1, keepdims=True)

    # numeric trick to prevent overflow in the exp-function
    max_exponent = 86. - margin
    scale = T.switch(T.gt(peak, max_exponent), peak - max_exponent, 0.)

    # numeric approximation to prevent underflow in the exp-function:
    # map too low values of I to a fixed minimum value
    min_exponent = -87. + margin
    I = T.switch(T.lt(I - scale, min_exponent), scale + min_exponent, I)

    # activation: recurrent softmax with overflow protection
    numerator = vM * T.exp(I - scale)
    s = numerator / T.sum(numerator, axis=1, keepdims=True)
    return s
def get_output_for(self, inputs, **kwargs):
    """Intended to max-pool ``inputs[0]`` over the boxes in ``inputs[1]``.

    NOTE(review): this body appears to be unfinished work in progress and
    cannot run as written. The individual problems are flagged inline;
    the code itself is left untouched because the intended design cannot
    be recovered from this block alone.
    """
    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    input = inputs[0]
    boxes = inputs[1]
    # All sizes below are symbolic (taken from T.shape), not Python ints.
    batch = T.shape(input)[0]
    channels = T.shape(input)[1]
    height = T.shape(input)[2]
    width = T.shape(input)[3]
    num_boxes = T.shape(boxes)[0]
    output = T.zeros((batch * num_boxes, channels, self.num_features))
    # NOTE(review): ``num_boxes`` is symbolic, so ``range(num_boxes)`` is
    # not expected to work; and ``bb`` would be an int produced by range,
    # not a row of ``boxes``, so the ``bb[...]`` indexing below cannot
    # work. Presumably the loop was meant to iterate over ``boxes``.
    for idbb, bb in enumerate(range(num_boxes)):
        batch_ind = bb[0]
        pool_list = []
        #for pool_dim in self.pool_dims:
        # Box corners scaled by ``self.sp_scale`` and clipped to the map.
        start_w = T.clip(T.floor(bb[1] * self.sp_scale), 0, width)
        # NOTE(review): ``heigth`` is not defined in this block; it looks
        # like a typo for ``height``.
        start_h = T.clip(T.floor(bb[2] * self.sp_scale), 0, heigth)
        end_w = T.clip(T.ceil(bb[3] * self.sp_scale), 0, width)
        end_h = T.clip(T.ceil(bb[4] * self.sp_scale), 0, height)
        # NOTE(review): ``T.max(x, 1)`` passes 1 as the axis argument;
        # an element-wise maximum with 1 was presumably intended.
        w = T.max(end_w - start_w + 1, 1)
        # NOTE(review): ``T.amx`` looks like a typo (``T.max`` /
        # ``T.maximum``?) -- TODO confirm.
        h = T.amx(end_h - start_h + 1, 1)
        # NOTE(review): ``pool_dims`` is not defined in this block
        # (``self.pool_dims`` appears in the commented-out loop above).
        start_samples_y, start_sample_x = T.floor(
            _meshgrid(start_h, end_h, pool_dims + 1, start_w, end_w, pool_dims + 1))
        end_samples_y, end_sample_x = T.ceil(
            _meshgrid(start_h, end_h, pool_dims + 1, start_w, end_w, pool_dims + 1))
        # NOTE(review): bare expression with no effect; ``py``, ``px``,
        # ``idy``, ``idx``, ``samples_y`` and ``samples_x`` are only bound
        # by the commented-out loops below.
        input[batch_ind, :, np.floor(py):np.ceil(samples_y[idy + 1]), np.floor(px):np.ceil(samples_x[idx + 1])]
        #T.max()
        #for idx,px in enumerate(samples_x[:-1]):
        #    for idy,py in enumerate(samples_y[:-1]):
        #        (pool.dnn_pool( input[batch_ind,:,np.floor(py):np.ceil(samples_y[idy+1]),np.floor(px):np.ceil(samples_x[idx+1])],(0,0),(None,None),'max', (0,0) )).flatten(2)
        #sz_w = ( w - 1 ) // pool_dim
        #sz_h = ( h - 1 ) // pool_dim
        #str_h = w // pool_dim
        #str_w = h // pool_dim
        #pool = dnn.dnn_pool( input[bb[0],:,start_h:end_h+1,start_w:end_w+1], (sz_h,sz_w), (str_h,str_w), 'max', (0,0) ).flatten(2)
        # NOTE(review): ``pool`` is only assigned in the commented-out
        # line above, so it is unbound here.
        pool_list.append(pool)
        # NOTE(review): item assignment on the result of ``T.zeros``;
        # presumably needs ``T.set_subtensor`` -- TODO confirm.
        output[idbb] = T.transpose(T.concatenate(
            pool_list, axis=1))  #not efficient but for the moment is ok!
    #if everything is correct this vector should be ordered as in fast RCNN
    return output
def process(self, input, tparams, BNparams):
    """Pool ``input`` once per pyramid level and concatenate the results.

    For every ``(h, w)`` pair in ``self.pymamid`` the input is pooled with
    a window of ``ceil(size / bins)`` and a stride of ``floor(size / bins)``
    along each of the last two axes, flattened to ``(batch, -1)`` and
    finally joined along axis 1.

    ``tparams`` and ``BNparams`` are accepted but not used here.
    """
    batch, _, in_h, in_w = input.shape
    levels = []
    for bins_h, bins_w in self.pymamid:
        ratio_h = in_h / bins_h
        ratio_w = in_w / bins_w
        window = (T.ceil(ratio_h).astype('int32'),
                  T.ceil(ratio_w).astype('int32'))
        step = (T.floor(ratio_h).astype('int32'),
                T.floor(ratio_w).astype('int32'))
        pooled = dnn_pool(img=input, ws=window, mode=self.mode,
                          stride=step, pad=(0, 0))
        levels.append(pooled.reshape([batch, -1]))
    return T.concatenate(levels, axis=1)
def pool_2d_nxn_regions(inputs, output_size, mode='max'):
    """
    Pool a 4D tensor into a fixed grid of output_size x output_size bins.

    Used by SpatialPyramidPoolingLayer. Refer to appendix A in [1]

    Parameters
    ----------
    inputs : a tensor with 4 dimensions (N x C x H x W)
    output_size: integer
        The output size of the pooling operation
    mode : string
        Pooling mode, one of 'max', 'average_inc_pad', 'average_exc_pad'
        Defaults to 'max'. Both average modes reduce with the mean.

    Returns a list of tensors, for each output bin.
       The list contains output_size*output_size elements, where
       each element is a 3D tensor (N x C x 1), ordered row by row.

    Raises ValueError if ``mode`` is not one of the supported values.

    References
    ----------
    .. [1] He, Kaiming et al (2015):
           Spatial Pyramid Pooling in Deep Convolutional Networks
           for Visual Recognition.
           http://arxiv.org/pdf/1406.4729.pdf.
    """
    # Reject unknown modes before touching the tensor library.
    if mode != 'max' and mode not in ('average_inc_pad', 'average_exc_pad'):
        template = ("Mode must be either 'max', 'average_inc_pad' or "
                    "'average_exc_pad'. Got '{0}'")
        raise ValueError(template.format(mode))
    reduce_op = T.max if mode == 'max' else T.mean

    height, width = inputs.shape[2:]
    bins = float(output_size)
    regions = []
    for i in range(output_size):
        # Row bounds depend only on the outer index.
        top = T.floor(i / bins * height).astype('int32')
        bottom = T.ceil((i + 1) / bins * height).astype('int32')
        for j in range(output_size):
            left = T.floor(j / bins * width).astype('int32')
            right = T.ceil((j + 1) / bins * width).astype('int32')

            window = inputs[:, :, top:bottom, left:right]
            reduced = reduce_op(window, axis=(2, 3))
            regions.append(reduced.dimshuffle(0, 1, 'x'))
    return regions
def blockify( inp, block_size = (1, 1), step_size = (1, 1), direction = (1, 1), padding = False): input_size = T.shape(inp) if padding: b0 = T.ceil((input_size[0] - block_size[0]) / step_size[0]) + 1 b1 = T.ceil((input_size[1] - block_size[1]) / step_size[1]) + 1 else: b0 = T.floor((input_size[0] - block_size[0]) / step_size[0]) + 1 b1 = T.floor((input_size[1] - block_size[1]) / step_size[1]) + 1 num_blocks = b0 * b1 for b in range(num_blocks):
def __theano_train__(self, n_size):
    """
    Build and compile the per-user training step (``self.seq_train``).

    Pr(l|u, C(l)) = Pr(l|u) * Pr(l|C(l))
    Pr(u, l, t) = Pr(l|u, C(l)) if C(l) exists, Pr(l|u) otherwise.
    $Theta$ = argmax Pr(u, l, t)

    The compiled function takes a user index, returns the mean negative
    log-likelihood ``upq`` and applies one gradient step. ``n_size`` is
    accepted but not used in this method.
    """
    # Symbolic placeholders; every one is bound through ``givens`` below.
    user_var = T.iscalar()
    target_var = T.ivector()
    context_var = T.imatrix()
    route_var = T.itensor3()
    mask_var = T.ivector()
    valid_len = T.sum(mask_var)  # effective (unmasked) length

    item_table = T.concatenate((self.wl, self.wl_m))
    branch_sub = self.pb[route_var]  # (seq_length x 4 x depth x n_size)
    signs = self.lrs[target_var]  # (seq_length x 4 x depth)

    # user preference
    user_sub = self.xu[user_var]
    pref = softmax(T.dot(user_sub, self.wl.T))

    # geographical influence
    ctx = T.sum(item_table[context_var], axis=1)  # (seq_length x n_size)
    ctx = ctx.reshape((ctx.shape[0], 1, 1, ctx.shape[1]))
    logits = T.sum(branch_sub[:valid_len] * ctx, axis=3) * signs[:valid_len]
    # ceil(|mean|) is 0 where the context mean is exactly 0, which
    # zeroes the corresponding branch probabilities.
    gate = T.ceil(abs(T.mean(ctx, axis=3)))
    branch_prob = sigmoid(logits) * gate
    route_prob = T.prod(branch_prob, axis=2) * self.probs[target_var][:valid_len]
    # paths = T.prod((T.floor(1-path) + path), axis=1)
    summed = T.sum(route_prob, axis=1)
    # floor(1 - p) + p leaves p in (0, 1] unchanged and maps p == 0 to 1.
    ctx_prob = T.floor(1 - summed) + summed

    # ------------------------------------------------------------------
    # cost, gradients, learning rate, l2 regularization
    lr, l2 = self.alpha_lambda[0], self.alpha_lambda[1]
    l2_penalty = T.sum([T.sum(par ** 2) for par in [user_sub, self.wl]])
    nll = -1 * T.sum(T.log(pref[target_var[:valid_len]] * ctx_prob)) / valid_len
    cost = (nll + 0.5 * l2 * l2_penalty)
    grads = T.grad(cost, self.params)
    updates = [(par, par - lr * gra)
               for par, gra in zip(self.params, grads)]
    # Sub-tensor updates: only the rows actually used get a step.
    for shared, sub in [(self.xu, user_sub), (self.pb, branch_sub)]:
        stepped = T.set_subtensor(sub, sub - lr * T.grad(cost, sub))
        updates.append((shared, stepped))

    # ------------------------------------------------------------------
    uidx = T.iscalar()  # T.iscalar() has type TensorType(int32, )
    context_rows = T.arange(self.tra_accum_lens[uidx][0],
                            self.tra_accum_lens[uidx][1])
    self.seq_train = theano.function(
        inputs=[uidx],
        outputs=nll,
        updates=updates,
        givens={
            user_var: uidx,
            target_var: self.tra_target_masks[uidx],
            context_var: self.tra_context_masks[context_rows],
            route_var: self.routes[self.tra_target_masks[uidx]],
            mask_var: self.tra_masks[uidx]
            # tra_mask_cot: self.tra_masks_cot[T.arange(self.tra_accum_lens[uidx][0], self.tra_accum_lens[uidx][1])]
        })
def compute_sub_all_scores(self, start_end):
    """Evaluate the scores of all items for the users in ``start_end``.

    Combines the user-preference softmax with the context-dependent path
    probabilities and returns the evaluated result reshaped to two
    dimensions. Shape comments below are carried over from the original
    author's notes.
    """
    # User preference; the last softmax column is dropped.
    user_pref = softmax(
        T.dot(self.trained_users[start_end], self.trained_items.T))
    user_pref = user_pref[:, :-1]  # (n_batch, n_item)

    # Longest test sequence in this batch.
    max_len = T.max(T.sum(self.tes_masks[start_end], axis=1))  # 253
    offsets = self.tra_accum_lens[start_end][:, 0].reshape(
        (len(start_end), 1))
    ctx_rows = T.arange(max_len).reshape((1, max_len)) + offsets

    ctx = T.sum(self.trained_items[self.tra_context_masks[ctx_rows]],
                axis=2)  # n_batch x seq_length x n_size
    ctx = ctx.dimshuffle(1, 2, 0)

    branches = self.trained_branch[
        self.routes]  # (n_item x 4 x tree_depth x n_size)
    dim0, dim1, dim2 = self.lrs.shape
    signs = self.lrs.reshape((dim0, dim1, dim2, 1, 1))

    proj = T.dot(branches, ctx)
    # (n_item x 4 x tree_depth x seq_length x n_batch)
    branch_prob = sigmoid(proj * signs) * T.ceil(abs(proj))
    route_prob = T.prod(branch_prob, axis=2) * self.probs.reshape(
        (dim0, dim1, 1, 1))
    # Drop local references to the large intermediates.
    del ctx, branches, branch_prob, signs

    # paths = T.prod((T.floor(1 - path) + path), axis=1)  # (n_item x seq_length x n_batch)
    summed = T.sum(route_prob, axis=1)
    # floor(1 - p) + p maps p == 0 to 1 and keeps p in (0, 1] as is.
    ctx_prob = T.floor(1 - summed) + summed

    scores = ctx_prob[:-1].T * user_pref.reshape(
        (user_pref.shape[0], 1, user_pref.shape[1]))  # (n_batch x n_item)
    # p = plu.reshape((plu.shape[0], 1, plu.shape[1])) * T.ones((plu.shape[0], length, plu.shape[1]))
    flat_shape = (scores.shape[0] * scores.shape[1], scores.shape[2])
    return T.reshape(scores, flat_shape).eval()
def compute_hard_windows(self, image_shape, location, scale):
    """Return integer corner coordinates of the window around each patch.

    The window is centred on ``location``, spans ``self.patch_shape /
    scale``, is grown by the kernel radius on both sides and clipped to
    the image. Returns ``(a, b)``: the top-left (front) and bottom-right
    (back) corners, cast to ``int16``.
    """
    # find topleft(front) and bottomright(back) corners for each patch
    half_extent = 0.5 * (T.cast(self.patch_shape, theano.config.floatX)
                         / scale)
    # grow by the kernel's radius for this cutoff and scale
    margin = self.kernel.k_sigma_radius(self.cutoff, scale)
    lower = location - half_extent - margin
    upper = location + half_extent + margin

    # clip to fit inside image and have nonempty window
    lower = T.clip(lower, 0, image_shape - 1)
    upper = T.clip(upper, lower + 1, image_shape)

    if self.batched_window:
        # take the bounding box of all windows; now the slices
        # will have the same length for each sample and scan can
        # be avoided.  comes at the cost of typically selecting
        # more of the input.
        lower = lower.min(axis=0, keepdims=True)
        upper = upper.max(axis=0, keepdims=True)

    # make integer: round the lower corner down and the upper corner up
    return T.cast(T.floor(lower), 'int16'), T.cast(T.ceil(upper), 'int16')
def get_stencil(self, t, r=None, texp=None):
    """Build the sorted list of contact times covering the span of ``t``.

    When ``r`` or ``texp`` is ``None`` the method returns only
    ``tt.shape_padright(t)``. Otherwise it returns the tuple
    ``(ts, flag)``: ``ts`` holds the four contact-point offsets repeated
    for every period between ``min(t)`` and ``max(t)`` and sorted;
    ``flag`` is zero for circular orbits and the sum of the third output
    of ``contact_points_op`` for eccentric ones.
    """
    if r is None or texp is None:
        return tt.shape_padright(t)

    zeros = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r)
    R = self.r_star + zeros
    half_period = 0.5 * self.period

    if self.ecc is None:
        # Equation 14 from Winn (2010)
        k = r / self.r_star
        b_sq = tt.square(self.b)
        factor = R / (self.a * self.sin_incl)
        hdur_plus = half_period * tt.arcsin(
            factor * tt.sqrt(tt.square(1 + k) - b_sq)) / np.pi
        hdur_minus = half_period * tt.arcsin(
            factor * tt.sqrt(tt.square(1 - k) - b_sq)) / np.pi
        ts = [-hdur_plus, -hdur_minus, hdur_minus, hdur_plus]
        flag = zeros
    else:
        def contacts(radius):
            # Contact points for the given combined radius.
            return self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                self.cos_incl + zeros, self.sin_incl + zeros, radius)

        def to_offset(mean_anomaly):
            # Convert to a time offset wrapped into [-P/2, P/2).
            shifted = (mean_anomaly - self.M0) / self.n + half_period
            return tt.mod(shifted, self.period) - half_period

        contact_plus = contacts(R + r)
        contact_minus = contacts(R - r)
        flag = contact_plus[2] + contact_minus[2]
        ts = [
            to_offset(contact_plus[0]),
            to_offset(contact_minus[0]),
            to_offset(contact_minus[1]),
            to_offset(contact_plus[1]),
        ]

    # First and one-past-last period boundaries that cover ``t``.
    start = self.period * tt.floor((tt.min(t) - self.t0) / self.period)
    start = start + self.t0
    end = self.period * (tt.ceil((tt.max(t) - self.t0) / self.period) + 1)
    end = end + self.t0

    def repeat_over_periods(t0, s0, e0, p0):
        return t0 + tt.arange(s0, e0, p0)

    scalar_case = zeros.ndim < 1
    tout = []
    for contact in ts:
        if scalar_case:
            tout.append(contact + tt.arange(start, end, self.period))
        else:
            # One arange per element; scan handles differing bounds.
            per_element = theano.scan(
                fn=repeat_over_periods,
                sequences=[contact, start, end, self.period],
            )[0]
            tout.append(per_element.flatten())

    return tt.sort(tt.concatenate(tout)), flag
def encode(self, state_below):
    """
    Encode the whole input sequence into ``n_encodings`` hidden vectors.

    :development:
        (1) may need to prepend encoding_length * padding array to the
            state_below to produce the same length sequence as state_below
        (2) can return an offset encoding by only returning certain indices
            of the encoding (though this is pretty wasteful)

    :type state_below: 2d tensor
    :param state_below: the entire sequence of states from the layer below
        the current one

    :rtype: 2d tensor
    :return: an encoding of ``state_below`` (the entire sequence of
        states) to be passed to the layer above, shape
        (n_encodings, n_hid)

    Side effects: sets ``self.n_encodings`` and
    ``self.n_padding_timesteps``.
    """
    float_x = theano.config.floatX
    total_sequence_length = T.cast(state_below.shape[0], float_x)
    self.n_encodings = T.cast(
        T.ceil(total_sequence_length / self.encoding_length), 'int32')
    self.n_padding_timesteps = T.cast(
        self.n_encodings * self.encoding_length - total_sequence_length,
        'int32')

    # ``np.cast`` was removed in NumPy 2.0; ``np.asarray`` with an explicit
    # dtype yields the same 0-d floatX zero on every NumPy version.
    zero = np.asarray(0, dtype=float_x)

    # Left-pad with zero rows so the length is a multiple of
    # encoding_length.
    zeros = T.alloc(zero, self.n_padding_timesteps, self.n_vis)
    state_below = T.concatenate((zeros, state_below))

    # NOTE(review): a row-major reshape to (encoding_length, n_encodings,
    # n_vis) assigns time step i * n_encodings + j to scan step i of
    # encoding j, i.e. encodings are interleaved rather than contiguous
    # chunks -- confirm this is intended.
    state_below = state_below.reshape(
        (self.encoding_length, self.n_encodings, self.n_vis))
    state_below = T.dot(state_below, self.Wxh) + self.bxh

    # a single output will be n_encoding rows with n_hid features each
    encoding_0 = T.alloc(zero, self.n_encodings, self.n_hid)

    # The scan updates are not used by this method.
    encodings, _ = scan(fn=self.encode_step,
                        sequences=[state_below],
                        outputs_info=[encoding_0],
                        non_sequences=[self.Whhe])
    # encodings is a 3d tensor (encoding_length, n_encodings, n_hid);
    # return the last step, a 2d tensor of shape (n_encodings, n_hid)
    return encodings[-1]
def get_pseudo_likelihood_cost(self, updates):
    """Stochastic approximation to the pseudo-likelihood.

    One visible column (tracked by a shared counter) is flipped per call
    and the free-energy difference is turned into a log-sigmoid cost. The
    counter increment is registered in ``updates``, which is mutated in
    place. Returns the symbolic cost.
    """
    # index of bit i in expression p(x_i | x_{\i})
    bit_i_idx = theano.shared(value=0, name='bit_i_idx')

    # binarize the input image by rounding to nearest integer
    xi = T.round(self.input)

    # free energy of the unmodified configuration
    fe_xi = self.free_energy(xi, self.scaling)

    # Flip column bit_i_idx and keep all other columns. For values in
    # {0, 1} the expression 1 - ceil(x / (x + 1)) equals 1 - x. The
    # result goes to a new tensor; xi itself is not modified.
    column = xi[:, bit_i_idx]
    flipped_column = 1 - T.ceil(column / (column + 1))
    xi_flip = T.set_subtensor(column, flipped_column)

    # free energy with the bit flipped
    fe_xi_flip = self.free_energy(xi_flip, self.scaling)

    # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i})))
    log_prob = T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))
    cost = T.mean(self.n_visible * log_prob)

    # advance the column index (modulo n_visible) as part of updates
    updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

    return cost
def k_areas_maxpooling(input,k):
    """Max-pool the last two axes of ``input`` with a size-derived filter.

    The filter size ``f`` is ``ceil(sqrt(area / k))`` where ``area`` is
    the product of the last two dimensions, so roughly ``k`` regions
    result.

    NOTE(review): several calls here could not be verified against the
    tensor library from this block alone (see inline notes); behaviour
    depends on what ``T`` is bound to.
    """
    #dynamic filter size
    # NOTE(review): int() is applied to the result of T.ceil; if
    # input.shape is symbolic this conversion is not expected to work.
    f=int(T.ceil(T.sqrt((input.shape[-1]*input.shape[-2])/float(k))))
    #how many zero rows have to be inserted at the end so that the filter size fits exactly into the number of rows.
    # NOTE(review): this is the remainder size % f, not the amount needed
    # to reach the next multiple of f -- confirm intent.
    rows_to_insert=input.shape[-1]%f
    #how many zero columns have to be inserted at the end so that the filter size fits exactly into the number of columns.
    columns_to_insert=input.shape[-2]%f
    #insert rows
    # NOTE(review): assumes T.insert follows numpy.insert(arr, obj,
    # values, axis) semantics -- TODO confirm it exists in ``T``.
    output=T.insert(input, input.shape[2]*T.ones(1).repeat(rows_to_insert), 0, axis=2)
    #insert columns
    output=T.insert(output, output.shape[3]*T.ones(1).repeat(columns_to_insert), 0, axis=3)
    output_shape=output.shape
    #take max out of every f (filter size) rows
    # NOTE(review): reshape receives the dimensions as separate positional
    # arguments rather than one shape tuple -- verify this is accepted.
    output=T.transpose(output, (0,1,3,2)).reshape(output_shape[0],output_shape[1],-1,f).max(3).reshape(output_shape[0],output_shape[1],output_shape[-1],-1).transpose((0,1,3,2))
    #take the max out of every f(filter size) columns
    # NOTE(review): output_shape[2]/f uses ``/``; whether this is an
    # integer depends on the Python version and operand types.
    output=output.reshape(output.shape[0],output.shape[1],-1,f).max(3).reshape(output_shape[0],output_shape[1],output_shape[2]/f,-1)
    return output
def get_k(self, input_shape):
    """Return k for dynamic k-max pooling at this layer, as int32.

    k is the larger of ``self.ktop`` and
    ``ceil((nroflayers - layernr) / nroflayers * input_shape[3])``.
    """
    remaining = (self.nroflayers - self.layernr) / float(self.nroflayers)
    dynamic_k = T.ceil(remaining * input_shape[3])
    largest = T.max([self.ktop, dynamic_k])
    return T.cast(largest, 'int32')
def __init__(self, input_ngram, input_sm, vocab_size, emb_dim, num_section,
             linear_W_emb=None, fix_emb=False, nonlinear=None,
             activation=None):
    """Build the embedding layer combining n-gram and sentence context.

    Parameters
    ----------
    input_ngram : tensor
        One row per n-gram; the last column holds the index of the
        sentence the n-gram belongs to, the other columns are word ids.
    input_sm : tensor
        One row per sentence; column 0 is the sentence length (not used
        here), the remaining columns are word ids.
    vocab_size, emb_dim : int
        Shape of the embedding matrix.
    num_section : int
        Number of equally sized sections each sentence is split into;
        every section is averaged into one ``emb_dim`` vector.
    linear_W_emb : array, optional
        Initial embedding matrix of shape (vocab_size, emb_dim). When
        omitted it is drawn uniformly from the module-level ``rng``
        within +/- ``init_range``.
    fix_emb : bool
        When true the embedding matrix is left out of ``self.params``.
    nonlinear, activation :
        When both are given, the sentence vector is passed through
        ``addNonlinearLayer`` before being concatenated.

    Sets ``self.W_emb``, ``self.output``, ``self.params`` and, in the
    non-linear case, ``self.non_linear_params``.
    """
    # ``rng`` and ``init_range`` are module-level names that are only
    # read here, so no ``global`` declaration is needed.
    if linear_W_emb is None:
        # random initialize
        linear_W_emb = np.asarray(
            rng.uniform(low=-init_range, high=init_range,
                        size=(vocab_size, emb_dim)),
            dtype=theano.config.floatX)
    else:
        # use the given model parameter
        given_vocab_size, given_emb_dim = linear_W_emb.shape
        assert(given_vocab_size == vocab_size and given_emb_dim == emb_dim)

    # shared variables
    self.W_emb = theano.shared(value=linear_W_emb, name='W_emb')

    # stack vectors
    input_ngram = T.cast(input_ngram, 'int32')
    input_sm = T.cast(input_sm, 'int32')

    # Local context: one row per n-gram holding the concatenated
    # embeddings of every column except the last (the sentence index).
    # output dimensions: batch_size * ((n_columns - 1) * emb_dim)
    output_local = self.W_emb[input_ngram[:, :-1].flatten()].reshape(
        (input_ngram.shape[0], emb_dim * (input_ngram.shape[1] - 1)))

    sentence_matrix = input_sm[:, 1:]
    sentence_num = sentence_matrix.shape[0]
    global_length = sentence_matrix.shape[1]
    section_length = T.cast(T.ceil(global_length / float(num_section)),
                            'int32')

    # Mean embedding of each section, joined once at the end.
    # ``range`` replaces the Python-2-only ``xrange``.
    # NOTE(review): if global_length is not a multiple of num_section the
    # last slice is shorter than section_length and the reshape below
    # presumably fails -- confirm callers pad accordingly.
    sections = []
    for i in range(num_section):
        word_ids = sentence_matrix[
            :, i * section_length:(i + 1) * section_length]
        section = self.W_emb[word_ids.flatten()].reshape(
            (sentence_num, section_length, emb_dim))
        sections.append(T.mean(section, axis=1))
    sentence_embeddings = T.concatenate(sections, axis=1)

    # get the sentence index for each ngram vector, and transform it to
    # 0-based by subtracting the index found in the first row
    sentence_indeces = input_ngram[:, -1]
    base_index = sentence_indeces[0]
    sentence_indeces = sentence_indeces - base_index

    # Global context: the sentence vector each n-gram belongs to.
    output_global = sentence_embeddings[sentence_indeces.flatten()].reshape(
        (sentence_indeces.shape[0], emb_dim * num_section))

    # handle non-linear layer
    if nonlinear is None or activation is None:
        self.output = T.concatenate([output_local, output_global], axis=1)
        # params is the word embedding matrix
        self.params = [self.W_emb] if not fix_emb else []
    else:
        self.non_linear_params, non_linear_output_global = addNonlinearLayer(
            output_global, emb_dim * num_section, nonlinear, activation)
        self.output = T.concatenate(
            [output_local, non_linear_output_global], axis=1)
        if fix_emb:
            self.params = self.non_linear_params
        else:
            self.params = [self.W_emb] + self.non_linear_params
def spp_max_pool_axis_kwargs(in_shape, out_shape):
    """Return max-pool keyword arguments for one axis of an SPP level.

    Given the input and desired output length along one axis, compute the
    pool size, stride and padding. The result is a dict with the keys
    ``ds``, ``st`` and ``padding``.
    """
    is_symbolic = (treeano.utils.is_variable(in_shape)
                   or treeano.utils.is_variable(out_shape))
    # maxpool requires static shape
    assert not is_symbolic

    def int_ceil(value):
        # The symbolic path is only reachable when assertions are disabled.
        if is_symbolic:
            return T.ceil(value).astype("int32")
        return int(np.ceil(value))

    # eg. if input is 5 and output is 2, each pool size should be 3
    pool_size = int_ceil(in_shape / out_shape)
    # pad as much as possible, since ignore_border=True
    padding = int_ceil((pool_size * out_shape - in_shape) / 2)

    if not is_symbolic:
        assert padding < pool_size

    # stride equals pool_size, since we want non-overlapping regions
    return dict(
        ds=pool_size,
        st=pool_size,
        padding=padding,
    )
def get_output_for(self, input, **kwargs):
    """Blend three shifted column gathers of the input, weighted by exp(-k*d).

    The input is viewed as ``(batch, self.x_len)``. For the positions
    ``self.p`` three gathers are combined: at ``floor(p)``, at
    ``ceil(p) + 1`` and at ``floor(p) - 1`` (all modulo ``x_len``), with
    weights ``exp(-k * d)`` for d = p - floor(p), 1 - d and 1 + d
    respectively, normalised by their sum. The result is reshaped back to
    the input's shape.
    """
    position = self.p
    sharpness = self.k
    x_len = self.x_len
    flat = input.reshape((input.shape[0], x_len))

    lower = T.cast(T.floor(position), 'int32')
    upper = T.cast(T.ceil(position), 'int32')

    def weight(distance):
        return T.exp(sharpness * -distance)

    def gather(index):
        return flat[:, index % x_len]

    # Deltas
    frac = position - T.floor(position)
    w_here = weight(frac)
    w_next = weight(1 - frac)
    w_prev = weight(1 + frac)

    # NOTE(review): the "next" gather uses ceil(p) + 1 while its weight
    # uses the distance 1 - frac -- confirm the offset is intended.
    blended = (w_here * gather(lower)
               + w_next * gather(upper + 1)
               + w_prev * gather(lower - 1))
    normalised = blended / (w_here + w_next + w_prev)
    return normalised.reshape(input.shape)
def get_pseudo_likelihood_cost(self, updates):
    """Stochastic approximation to the pseudo-likelihood.

    Flips a single visible column per call (selected by a shared index
    that is advanced through ``updates``) and returns the mean of
    ``n_visible * log(sigmoid(FE(flipped) - FE(original)))``.
    """
    # index of bit i in expression p(x_i | x_{\i})
    bit_i_idx = theano.shared(value=0, name='bit_i_idx')

    # binarize the input image by rounding to nearest integer
    xi = T.round(self.input)

    # Build a copy of xi with column bit_i_idx flipped; all other bits
    # x_{\i} are preserved. For binary values this equals
    # xi[:, bit_i_idx] = 1 - xi[:, bit_i_idx], without working in place.
    selected = xi[:, bit_i_idx]
    xi_flip = T.set_subtensor(selected,
                              1 - T.ceil(selected / (selected + 1)))

    # free energy of the original and of the flipped configuration
    fe_original = self.free_energy(xi, self.scaling)
    fe_flipped = self.free_energy(xi_flip, self.scaling)

    # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i})))
    energy_gap = fe_flipped - fe_original
    cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(energy_gap)))

    # increment bit_i_idx % number as part of updates
    updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

    return cost
def get_output_for( self, inputs ,**kwargs ):
    """Intended to max-pool ``inputs[0]`` over the boxes in ``inputs[1]``.

    NOTE(review): this body appears to be unfinished and cannot run as
    written; the problems are flagged inline. The code is left unchanged
    because the intended design cannot be recovered from this block.
    """
    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    input = inputs[0]
    boxes = inputs[1]
    # Sizes are symbolic values taken from T.shape, not Python ints.
    batch = T.shape (input)[0]
    channels = T.shape (input)[1]
    height = T.shape( input )[2]
    width = T.shape( input )[3]
    num_boxes = T.shape(boxes)[0]
    output = T.zeros((batch * num_boxes , channels, self.num_features))
    # NOTE(review): range() is given a symbolic count, and ``bb`` would be
    # an int from range rather than a row of ``boxes``; the ``bb[...]``
    # indexing below therefore cannot work as written.
    for idbb,bb in enumerate(range(num_boxes)):
        batch_ind = bb[0]
        pool_list = []
        #for pool_dim in self.pool_dims:
        # Scaled box corners, clipped to the feature-map extent.
        start_w = T.clip(T.floor(bb[1] * self.sp_scale),0,width)
        # NOTE(review): ``heigth`` is undefined here (typo for ``height``?).
        start_h = T.clip(T.floor(bb[2] * self.sp_scale),0,heigth)
        end_w = T.clip(T.ceil(bb[3] * self.sp_scale),0,width)
        end_h = T.clip(T.ceil(bb[4] * self.sp_scale),0,height)
        # NOTE(review): the second argument of T.max is an axis; an
        # element-wise maximum with 1 was presumably intended.
        w = T.max(end_w - start_w +1,1)
        # NOTE(review): ``T.amx`` looks like a typo -- TODO confirm.
        h = T.amx(end_h - start_h +1,1)
        # NOTE(review): ``pool_dims`` is not defined in this block.
        start_samples_y,start_sample_x = T.floor(_meshgrid(start_h,end_h,pool_dims+1,start_w,end_w,pool_dims+1))
        end_samples_y,end_sample_x = T.ceil(_meshgrid(start_h,end_h,pool_dims+1,start_w,end_w,pool_dims+1))
        # NOTE(review): expression statement without effect; py, px, idy,
        # idx, samples_y and samples_x are only bound in the commented-out
        # loops below.
        input[batch_ind,:,np.floor(py):np.ceil(samples_y[idy+1]),np.floor(px):np.ceil(samples_x[idx+1])]
        #T.max()
        #for idx,px in enumerate(samples_x[:-1]):
        #    for idy,py in enumerate(samples_y[:-1]):
        #        (pool.dnn_pool( input[batch_ind,:,np.floor(py):np.ceil(samples_y[idy+1]),np.floor(px):np.ceil(samples_x[idx+1])],(0,0),(None,None),'max', (0,0) )).flatten(2)
        #sz_w = ( w - 1 ) // pool_dim
        #sz_h = ( h - 1 ) // pool_dim
        #str_h = w // pool_dim
        #str_w = h // pool_dim
        #pool = dnn.dnn_pool( input[bb[0],:,start_h:end_h+1,start_w:end_w+1], (sz_h,sz_w), (str_h,str_w), 'max', (0,0) ).flatten(2)
        # NOTE(review): ``pool`` is only assigned in the commented-out line
        # above, so it is unbound at this point.
        pool_list.append( pool )
        # NOTE(review): item assignment on the result of T.zeros;
        # presumably T.set_subtensor is needed -- TODO confirm.
        output[idbb] = T.transpose(T.concatenate( pool_list, axis=1 )) #not efficient but for the moment is ok!
    #if everything is correct this vector should be ordered as in fast RCNN
    return output
def get_output_shape_for(self, input_shape):
    """Return the output shape with axis 1 replaced by the dynamic k.

    k is ``max(ktop, ceil((numLayers - currlayer) / numLayers * inputdim))``
    cast to int32; axes 0 and 2 of ``input_shape`` are passed through.
    """
    fraction = (self.numLayers - self.currlayer) / float(self.numLayers)
    scaled = T.ceil(fraction * self.inputdim)
    k = K.cast(K.max([self.ktop, scaled]), 'int32')
    return (input_shape[0], k, input_shape[2])
def _build_expression(self, input_expression=None):
    """Build the symbolic pooling expression and store it on ``self``.

    Sets ``self.input_`` (a fresh 4D tensor when no expression is given)
    and ``self.expression_`` (the pooled output cropped to the
    ceil-based output shape). Raises ``NotImplementedError`` for pool
    types other than 'max' and 'avg'.
    """
    if self.pool_type not in ['max', 'avg']:
        raise NotImplementedError(
            'Pooling only implemented for max and avg')

    self.input_ = (T.tensor4(dtype=self.input_dtype)
                   if input_expression is None else input_expression)

    # Replicating caffe style pooling means zero padding
    # then strided pooling with ignore_border=True
    if self.padding in [0, (0, 0)]:
        padded = self.input_
    else:
        padder = ZeroPad(padding=self.padding)
        padder._build_expression(self.input_)
        padded = padder.expression_

    if self.pool_type == 'max':
        pooled = fancy_max_pool(padded, self.pool_shape, self.pool_stride,
                                ignore_border=False)
    elif self.pool_type == 'avg':
        # Average pooling as a strided convolution with a constant
        # kernel. self.pool_shape needs to be a tuple.
        in_dtype = self.input_.dtype
        ones = T.ones((1, 1) + self.pool_shape, dtype=in_dtype)
        kernel = T.cast(ones / np.prod(self.pool_shape), in_dtype)
        n_imgs = self.input_.shape[0]
        n_channels = self.input_.shape[1]
        # Fold channels into the batch axis so that every channel is
        # convolved independently.
        stacked = padded.reshape((n_imgs * n_channels, 1,
                                  padded.shape[2], padded.shape[3]))
        convolved = T.nnet.conv2d(stacked, kernel,
                                  subsample=self.pool_stride)
        pooled = convolved.reshape((n_imgs, n_channels,
                                    convolved.shape[2],
                                    convolved.shape[3]))

    # A caffe quirk: The output shape is (for width, analogous for h:)
    # ceil((w + 2 * pad_w - kernel_w) / stride_w) + 1, instead of floor
    # With floor, ignore_border=True would have yielded the exact result
    # With ceil, sometimes we need an extra column and/or line. So we do
    # ignore_border=False and then crop to the right shape. Since the
    # shape is dynamic we need to first calculate it:

    # padding, pool shape and stride are all expected to be tuples here
    # (not checked)
    pad = T.constant(self.padding)
    # pad = T.constant(zero_padder.padding_)
    window = T.constant(self.pool_shape)
    stride = T.constant(self.pool_stride, dtype='float64')
    float_shape = (self.input_.shape[2:4] + 2 * pad - window) / stride + 1
    out_shape = T.cast(T.ceil(float_shape), dtype='int64')
    self.expression_ = pooled[:, :, 0:out_shape[0], 0:out_shape[1]]
def compileActivation(self, net, layerNum):
    """Build this convolutional layer's activation and append it to the net.

    Takes ``net.x`` (first layer) or the previous entry of
    ``net.varArrayA`` as input, applies convolution, bias, optional max
    pooling and ``self.activation``, and appends the result to
    ``net.varArrayA``. Nothing is returned.
    """
    layer_input = net.x if layerNum == 0 else net.varArrayA[layerNum - 1]
    weights = net.varWeights[layerNum]
    n_channels = self.kernel_shape[1]

    #Calc shapes for reshape function on-the-fly. Assume we have square images as input.
    side = T.cast(T.sqrt(T.shape(layer_input)[0] / n_channels), 'int16')

    #Converts input from 2 to 4 dimensions
    images = T.reshape(layer_input.T,
                       (T.shape(layer_input)[1], n_channels, side, side))

    if self.optimized:
        out_size = T.cast(
            T.ceil((T.shape(images)[-1] - T.shape(weights['w'])[-1] + 1)
                   / np.float32(self.stride)),
            'int32')

        conv_op = FilterActs(stride=self.stride)
        shuffled_images = images.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        shuffled_filters = weights['w'].dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        flipped_filters = shuffled_filters[:, ::-1, ::-1, :]  # flip rows and columns
        if self.dropout:
            mask = net.dropOutVectors[layerNum].dimshuffle('x', 0, 1, 'x')
        else:
            mask = 1.0
        a = conv_op(gpu_contiguous(shuffled_images),
                    gpu_contiguous(flipped_filters * mask))
        a = a[:, :out_size, :out_size, :]
        #Add bias
        a = a + weights['b'].dimshuffle(0, 'x', 'x', 'x')
    else:
        if self.dropout:
            mask = net.dropOutVectors[layerNum].dimshuffle('x', 'x', 0, 1)
        else:
            mask = 1.0
        a = T.nnet.conv2d(images, weights['w'] * mask,
                          border_mode='valid',
                          subsample=(self.stride, self.stride))
        #Add bias
        a = a + weights['b'].dimshuffle('x', 0, 'x', 'x')

    if self.optimized:
        if self.pooling:
            #Pooling
            # ds - side of square pool window
            # stride - Defines the stride size between successive pooling squares.
            # Setting this parameter smaller than sizeX produces overlapping pools.
            # Setting it equal to sizeX gives the usual, non-overlapping pools. Values greater than sizeX are not allowed.
            pool_op = MaxPool(ds=self.pooling_shape, stride=self.pooling_shape)
            a = pool_op(gpu_contiguous(a))
        # The optimized path works in c01b layout; convert back.
        a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01
    elif self.pooling:
        #a = downsample.max_pool_2d(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
        a = pool.max_pool2D(a, (self.pooling_shape, self.pooling_shape),
                            ignore_border=False)

    a = T.flatten(a, outdim=2).T

    #Sigmoid
    a = self.activation(a, self.pool_size)
    net.varArrayA.append(a)
def _ppf(self, p):
    """
    Percentile point function of the discrete Weibull distribution.

    This is the inverse of the cumulative distribution function:
    ``ceil((log(1 - p) / log(q)) ** (1 / beta)) - 1``, returned as int64.
    """
    log_ratio = tt.log(1 - p) / tt.log(self.q)
    quantile = tt.ceil(tt.power(log_ratio, 1. / self.beta)) - 1
    return quantile.astype('int64')
def gaussian_kernel_default_radius(sigma, window_radius=None):
    """Return the window radius to use for a Gaussian kernel.

    Parameters
    ----------
    sigma : float, int or tensor
        Standard deviation(s) of the kernel.
    window_radius : optional
        Explicit radius; returned unchanged when given.

    Returns
    -------
    ``window_radius`` if it is not ``None``. Otherwise ``ceil(3 * sigma)``:
    a Python ``int`` when ``sigma`` is a plain number, or a symbolic
    int32 scalar (the maximum over all entries) when ``sigma`` is a
    tensor.
    """
    if window_radius is not None:
        return window_radius

    if isinstance(sigma, (float, int)):
        # Plain numbers need no symbolic graph: computing the ceiling
        # directly avoids building and evaluating one just for a scalar.
        import math
        return int(math.ceil(3 * sigma))

    return T.cast(T.max(T.ceil(3 * sigma)), 'int32')
def get_hidden_values(self, input, batch_size):
    """Linearly interpolate ``self.W`` at ``self.indices`` and set ``self.output``.

    Stores the intermediate results on the instance (``indices_high``,
    ``indices_low``, ``factors_high``, ``factors_low``, ``factors``,
    ``output``) and returns nothing. ``input`` and ``batch_size`` are
    accepted but not used here.
    """
    # Neighbouring integer positions around each fractional index.
    # NOTE(review): int8 limits usable indices to 127 -- confirm that is
    # sufficient for the size of W.
    self.indices_high = T.ceil(self.indices).astype('int8')
    self.indices_low = T.floor(self.indices).astype('int8')

    self.factors_high = self.W[self.indices_high]
    self.factors_low = self.W[self.indices_low]

    # Linear interpolation; the 1E-5 keeps the denominator non-zero when
    # an index is already an integer (high == low).
    rise = self.factors_high - self.factors_low
    offset = self.indices - self.indices_low
    span = self.indices_high - self.indices_low + 1E-5
    self.factors = rise * offset / span + self.factors_low

    weighted = self.x * T.transpose(self.factors).dimshuffle(0, 'x', 1)
    normaliser = (self.length + 1.0).dimshuffle(0, 'x')
    self.output = T.sum(weighted, axis=2) / normaliser
def gaussian_kernel_default_radius(sigma, window_radius=None):
    """Pick the default radius (three sigma, rounded up) of a Gaussian window.

    Parameters
    ----------
    sigma : float, int or tensor
        Kernel standard deviation(s).
    window_radius : optional
        When not ``None`` it takes precedence and is returned as is.

    Returns
    -------
    The given ``window_radius``; or ``int(ceil(3 * sigma))`` for a plain
    Python number; or, for a tensor ``sigma``, a symbolic int32 scalar
    holding the largest ``ceil(3 * sigma)``.
    """
    if window_radius is not None:
        return window_radius

    if isinstance(sigma, (float, int)):
        # A Python number is handled with plain arithmetic; there is no
        # need to construct and evaluate a symbolic expression for it.
        import math
        return int(math.ceil(3 * sigma))

    radius = T.cast(T.max(T.ceil(3 * sigma)), 'int32')
    return radius
def _ppf(self, p):
    r"""
    Inverse CDF (percentile point function) of the discrete Weibull
    distribution.

    Evaluates ``ceil((log(1 - p) / log(q)) ** (1 / beta)) - 1`` with the
    instance's ``q`` and ``beta`` and casts the result to int64.
    """
    exponent = 1.0 / self.beta
    base = tt.log(1 - p) / tt.log(self.q)
    return (tt.ceil(tt.power(base, exponent)) - 1).astype("int64")
def compileActivation(self, net, layerNum):
    """Construct the symbolic activation of this convolutional layer.

    The input is ``net.x`` for the first layer, otherwise the previous
    entry of ``net.varArrayA``. After convolution, bias, optional max
    pooling and ``self.activation`` the result is appended to
    ``net.varArrayA``; the method returns nothing.
    """
    previous = net.x if layerNum == 0 else net.varArrayA[layerNum - 1]
    params = net.varWeights[layerNum]
    channels = self.kernel_shape[1]

    #Calc shapes for reshape function on-the-fly. Assume we have square images as input.
    edge = T.cast(T.sqrt(T.shape(previous)[0] / channels), 'int16')

    #Converts input from 2 to 4 dimensions
    batch_images = T.reshape(
        previous.T, (T.shape(previous)[1], channels, edge, edge))

    if self.optimized:
        valid_extent = (T.shape(batch_images)[-1]
                        - T.shape(params['w'])[-1] + 1)
        out_size = T.cast(T.ceil(valid_extent / np.float32(self.stride)),
                          'int32')

        conv_op = FilterActs(stride=self.stride)
        images_c01b = batch_images.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = params['w'].dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = filters_c01b[:, ::-1, ::-1, :]  # flip rows and columns
        keep = (net.dropOutVectors[layerNum].dimshuffle('x', 0, 1, 'x')
                if self.dropout else 1.0)
        a = conv_op(gpu_contiguous(images_c01b),
                    gpu_contiguous(filters_c01b * keep))
        a = a[:, :out_size, :out_size, :]
        #Add bias
        a = a + params['b'].dimshuffle(0, 'x', 'x', 'x')
    else:
        keep = (net.dropOutVectors[layerNum].dimshuffle('x', 'x', 0, 1)
                if self.dropout else 1.0)
        a = T.nnet.conv2d(batch_images, params['w'] * keep,
                          border_mode='valid',
                          subsample=(self.stride, self.stride))
        #Add bias
        a = a + params['b'].dimshuffle('x', 0, 'x', 'x')

    if self.optimized:
        if self.pooling:
            #Pooling
            # ds - side of square pool window
            # stride - Defines the stride size between successive pooling squares.
            # Setting this parameter smaller than sizeX produces overlapping pools.
            # Setting it equal to sizeX gives the usual, non-overlapping pools. Values greater than sizeX are not allowed.
            pool_op = MaxPool(ds=self.pooling_shape, stride=self.pooling_shape)
            a = pool_op(gpu_contiguous(a))
        # Back from the c01b layout used by the optimized ops.
        a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01
    elif self.pooling:
        a = downsample.max_pool_2d(
            a, (self.pooling_shape, self.pooling_shape), ignore_border=False)

    a = T.flatten(a, outdim=2).T

    #Sigmoid
    a = self.activation(a, self.pool_size)
    net.varArrayA.append(a)
def dynamic_k_max_pooling(input, sent_sizes, k_max_factor, k_max_final):
    """
    Select, along the word axis, a per-sentence number of entries of a 4D
    tensor via masked argsort indices.

    k_max_factor -- multiplied by sentence_sizes gives the value of kmax for
                    each sentence
    k_max_final  -- lower bound applied to every sentence's kmax

    The input is indexed as (batch, channel, word, dim). The result keeps
    ``ceil(nwords * k_max_factor)`` positions on the word axis.
    """
    # Work on (batch, channel, dim, word) so words are on the last axis.
    nbatches = input.shape[0]
    nchannels = input.shape[1]
    nwords = input.shape[2]
    ndim = input.shape[3]
    x = input.dimshuffle(0, 1, 3, 2)

    # Per-sentence k: ceil(size * factor), at least k_max_final.
    k_per_sent = T.cast(T.ceil(sent_sizes * k_max_factor), dtype='int32')
    k_per_sent = T.maximum(k_per_sent, k_max_final)
    # sent_sizes_matrix = T.repeat(sent_sizes, nwords, axis=1)
    k_matrix = T.repeat(k_per_sent.dimshuffle(0, 'x'), nwords, axis=1)

    positions = T.repeat(T.arange(nwords).dimshuffle('x', 0),
                         nbatches, axis=0)
    # True for the last k entries of every row (mask is reversed).
    keep_mask = T.lt(positions, k_matrix)[:, ::-1]

    order = T.argsort(x, axis=3)
    # Shift by one so that masked-out entries become -1, then sort the
    # indices so that the kept ones end up at the tail.
    masked_order = ((order + 1)
                    * keep_mask.dimshuffle(0, 'x', 'x', 1)) - 1
    sorted_order = masked_order.sort(axis=3)

    nwords_max = T.cast(T.ceil(nwords * k_max_factor), 'int32')
    # print nwords_max.eval()
    tail = sorted_order[:, :, :, -nwords_max:]

    # Flat index vectors for advanced indexing into x.
    batch_ids = T.repeat(T.arange(nbatches), nchannels * ndim * nwords_max)

    channel_ids = T.repeat(T.arange(nchannels),
                           ndim * nwords_max).dimshuffle('x', 0)
    channel_ids = T.repeat(channel_ids, nbatches, axis=0).flatten()

    dim_ids = T.repeat(T.arange(ndim), nwords_max,
                       axis=0).dimshuffle('x', 'x', 0)
    dim_ids = T.repeat(dim_ids, nchannels, axis=1)
    dim_ids = T.repeat(dim_ids, nbatches, axis=0).flatten()

    word_ids = tail.flatten()

    gathered = x[batch_ids, channel_ids, dim_ids, word_ids]
    gathered = gathered.reshape((nbatches, nchannels, ndim, nwords_max))
    return gathered.dimshuffle(0, 1, 3, 2)
def weighted_vector_mse(self, y_true, y_pred):
    """Return a squared-error loss weighted by ``ceil(y_true)``.

    The targets and predictions are also stored on the instance as
    ``self.y_true`` / ``self.y_pred``.  The squared weighted residual is
    summed over the last axis and then averaged over the remaining ones.
    """
    self.y_true, self.y_pred = y_true, y_pred

    # ceil() of the target is used as the per-element weight.
    # presumably targets lie in [0, 1] so this acts as a 0/1 mask -- TODO confirm
    weight = T.ceil(y_true)
    residual = y_true - y_pred
    # use appropriate relations for other objectives. E.g, for binary_crossentropy:
    # loss = weights * (y_true * T.log(y_pred) + (1.0 - y_true) * T.log(1.0 - y_pred))
    per_sample = T.sum(T.square(weight * residual), axis=-1)
    return T.mean(per_sample)
def call(self, x, mask=None):
    """Apply k-max pooling along axis 1 of ``x``.

    k is the larger of ``self.ktop`` and
    ``ceil((numLayers - currlayer) / numLayers * inputdim)``.  The k largest
    entries along axis 1 are returned in their original order.  ``mask`` is
    accepted but not used.
    """
    depth_ratio = (self.numLayers - self.currlayer) / float(self.numLayers)
    k = K.cast(K.max([self.ktop, T.ceil(depth_ratio * self.inputdim)]), 'int32')

    # Indices of the k largest values along axis 1, re-sorted so the selected
    # entries keep their original relative order.
    top_idx = T.sort(T.argsort(x, axis=1)[:, -k:, :], axis=1)

    # Broadcastable index grids for the untouched first and last axes.
    rows = T.arange(x.shape[0]).dimshuffle(0, "x", "x")
    cols = T.arange(x.shape[2]).dimshuffle("x", "x", 0)
    return x[rows, top_idx, cols]
def pool(self, x, mode, pool_size, strides, padding=(0, 0)):
    """Pool ``x`` over its trailing 2 or 3 axes using Theano's ``pool_2d``.

    x         -- input tensor; pooled axes start at index 2
    mode      -- pooling mode; ``'avg'`` is mapped to ``'average_exc_pad'``
    pool_size -- window size per pooled axis (length 2 or 3)
    strides   -- stride per pooled axis; ``None`` means use ``pool_size``
    padding   -- per-axis padding; an entry may be an int or a ``(lo, hi)``
                 tuple with ``hi == lo + 1``

    Returns the pooled tensor, cropped to ``ceil(input_size / stride)`` along
    every pooled axis.
    """
    if strides is None:
        strides = pool_size
    assert len(strides) == len(pool_size)
    is_2d = len(pool_size) == 2

    if mode == 'avg':
        mode = 'average_exc_pad'

    # theano requires symmetric padding, so when the two sides differ we pad
    # with the larger amount on both sides and crop the surplus afterwards.
    sym_pad = []
    for p in padding:
        if isinstance(p, tuple):
            assert p[1] == p[0] + 1
            sym_pad.append(p[1])
        else:
            sym_pad.append(p)

    if is_2d:
        pooled = pool.pool_2d(x, ws=pool_size, stride=strides,
                              ignore_border=True, pad=sym_pad, mode=mode)
    else:
        # First pass: bring axes 2 and 3 to the end and pool over them.
        hw_last = x.dimshuffle(0, 1, 4, 2, 3)
        pooled = pool.pool_2d(hw_last, ws=pool_size[:2], stride=strides[:2],
                              ignore_border=True, pad=sym_pad[:2], mode=mode)
        # Second pass: restore the axis order and pool the remaining axis with
        # a window of 1 along its neighbour.
        z_last = pooled.dimshuffle(0, 1, 3, 4, 2)
        pooled = pool.pool_2d(z_last, ws=(1, pool_size[2]),
                              stride=(1, strides[2]), ignore_border=True,
                              pad=(0, sym_pad[2]), mode=mode)

    # theano might output more than expected output shape (due to max
    # padding). We truncate them here.
    expected = [
        self.cast(T.ceil(self.cast(x.shape[axis + 2], _FLOATX) / stride), 'int32')
        for axis, stride in enumerate(strides)
    ]
    if is_2d:
        return pooled[:, :, :expected[0], :expected[1]]
    return pooled[:, :, :expected[0], :expected[1], :expected[2]]
def set_k_max(layer, k_top, layer_position, nb_layers, sentence_length):
    """Store the dynamic k-max pooling size on ``layer.k_max``.

    k_max is the larger of ``k_top`` and
    ``ceil(sentence_length * (nb_layers - layer_position) / nb_layers)``,
    i.e. it depends on the number of convolutional layers and on the position
    of the layer in the network.
    http://nal.co/papers/Kalchbrenner_DCNN_ACL14

    Nothing is returned; the layer is modified in place.
    """
    # The "* 1." forces float division for plain Python integers.
    remaining_fraction = (nb_layers - layer_position) * 1. / nb_layers
    dynamic_k = T.cast(T.ceil(sentence_length * remaining_fraction), 'int32')
    layer.k_max = T.maximum(k_top, dynamic_k)
def get_stencil(self, t, r=None, texp=None):
    """Build a sorted grid of contact times covering the span of ``t``.

    When ``r`` or ``texp`` is ``None`` the input times are returned with one
    broadcastable axis appended and nothing else is computed.  Otherwise four
    contact-time offsets are computed (analytically when ``self.ecc`` is
    ``None``, through ``self.contact_points_op`` otherwise), replicated for
    every period overlapping ``t``, concatenated and sorted.

    Returns ``tt.shape_padright(t)`` in the early-exit case, else the tuple
    ``(ts, flag)``.  Note that ``texp`` is only tested against ``None``.
    """
    if r is None or texp is None:
        return tt.shape_padright(t)

    zeros = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r)
    r_star = self.r_star + zeros
    half_period = 0.5 * self.period

    if self.ecc is None:
        # Equation 14 from Winn (2010)
        ratio = r / self.r_star
        b_sq = tt.square(self.b)
        scale = r_star / (self.a * self.sin_incl)
        outer = half_period * tt.arcsin(
            scale * tt.sqrt(tt.square(1 + ratio) - b_sq)) / np.pi
        inner = half_period * tt.arcsin(
            scale * tt.sqrt(tt.square(1 - ratio) - b_sq)) / np.pi
        contacts = [-outer, -inner, inner, outer]
        flag = zeros
    else:
        def _to_time(mean_anomaly):
            # Convert to a time offset wrapped into [-period/2, period/2).
            return tt.mod((mean_anomaly - self.M0) / self.n + half_period,
                          self.period) - half_period

        outer_pts = self.contact_points_op(
            self.a, self.ecc, self.cos_omega, self.sin_omega,
            self.cos_incl + zeros, self.sin_incl + zeros, r_star + r)
        inner_pts = self.contact_points_op(
            self.a, self.ecc, self.cos_omega, self.sin_omega,
            self.cos_incl + zeros, self.sin_incl + zeros, r_star - r)
        flag = outer_pts[2] + inner_pts[2]
        contacts = [
            _to_time(outer_pts[0]),
            _to_time(inner_pts[0]),
            _to_time(inner_pts[1]),
            _to_time(outer_pts[1]),
        ]

    # Whole-period window around the observed times, anchored on t0.
    first = self.period * tt.floor((tt.min(t) - self.t0) / self.period)
    last = self.period * (tt.ceil((tt.max(t) - self.t0) / self.period) + 1)
    first = first + self.t0
    last = last + self.t0

    pieces = []
    for contact in contacts:
        if zeros.ndim < 1:
            pieces.append(contact + tt.arange(first, last, self.period))
        else:
            # Non-scalar parameters: build one range per element via scan.
            per_element = theano.scan(
                fn=lambda t0, s0, e0, p0: t0 + tt.arange(s0, e0, p0),
                sequences=[contact, first, last, self.period],
            )[0]
            pieces.append(per_element.flatten())

    return tt.sort(tt.concatenate(pieces)), flag
def R2_RNN_block(tparams, inputs, prefix=None, name='r2_rnn', std=True):
    """Repeatedly collapse adjacent pairs of time steps with ``RNN_layer``.

    ``inputs`` is sliced as a 3-D tensor with time on axis 0.  The block runs
    ``ceil(log2(n_steps))`` rounds; in each round every window of up to two
    consecutive steps is passed through ``RNN_layer`` and replaced by the last
    output of that call, so the number of valid steps is ``(num + 1) // 2``
    after a round.

    tparams -- parameter container forwarded to ``RNN_layer``
    inputs  -- input sequence tensor
    prefix, name -- combined through ``GetPrefix`` to name parameters/scans
    std     -- when true, ``inputs`` is passed through ``standardize`` first

    Returns the valid leading slice of the final buffer.
    """
    prefix = GetPrefix(prefix, name)
    n_steps = inputs.shape[0]
    n_rounds = T.ceil(T.log2(n_steps)).astype('uint32')
    rounds = T.arange(n_rounds)

    def _step_inner(index, num, inps):
        # Window [2*index, min(2*index + 2, num)); keep the last RNN output.
        start = index * 2
        stop = T.minimum(start + 2, num)
        h = RNN_layer(tparams, inps[start:stop, :, :],
                      prefix=prefix, name=None, std=False)
        return h[-1, :, :]

    def _step(r_step, num, inps, std=True):
        # Floor division keeps the window count an integer on every Python
        # version; "/" would produce a float range under true division, and
        # those values are used as slice indices in _step_inner.
        steps = T.arange((num + 1) // 2)
        out, _ = theano.scan(
            lambda index, num, inps: _step_inner(index, num, inps),
            sequences=[steps],
            outputs_info=None,
            non_sequences=[num, inps],
            name=_p(prefix, 'inner_scan'),
            n_steps=steps.shape[0],
            profile=False)
        # Write the shortened sequence into a zero buffer of the original
        # shape so the carried state keeps a fixed shape across rounds.
        new_num = out.shape[0]
        buf = T.set_subtensor(T.zeros_like(inps)[:new_num], out)
        return new_num, buf

    if std:
        inputs = standardize(inputs)

    out, _ = theano.reduce(
        lambda r_step, num, inps: _step(r_step, num, inps),
        sequences=rounds,
        outputs_info=[inputs.shape[0], inputs],
        name=_p(prefix, 'scan'))
    # out[0] is the final number of valid steps, out[1] the padded buffer.
    return out[1][:out[0]]
def dynamic_k_max_pooling(input, sent_sizes, k_max_factor, k_max_final):
    """Keep, per sentence, the k largest entries along the word axis.

    ``input`` is indexed as a 4-D tensor whose axes are unpacked below as
    (batch, channel, word, dim).  For each (batch, channel, dim) row the
    entries are ranked along the word axis; the top-ranked ones are gathered
    in their original word order.

    k_max_factor -- multiplied by sentence_sizes gives the value of kmax for
        each sentence
    k_max_final -- lower bound applied to every per-sentence k

    Returns a tensor reshaped to
    (batch, channel, ceil(nwords * k_max_factor), dim).
    """
    shape = input.shape
    n_batch, n_chan, n_words, n_dim = shape[0], shape[1], shape[2], shape[3]

    # Move the word axis last so that ranking happens along axis 3.
    x = input.dimshuffle(0, 1, 3, 2)

    # Per-sentence k: ceil(size * factor), never below k_max_final.
    k_per_sent = T.maximum(
        T.cast(T.ceil(sent_sizes * k_max_factor), dtype='int32'),
        k_max_final)
    k_grid = T.repeat(k_per_sent.dimshuffle(0, 'x'), n_words, axis=1)
    pos_grid = T.repeat(T.arange(n_words).dimshuffle('x', 0), n_batch, axis=0)
    # After the reversal the mask is set on the LAST k positions of each row,
    # which is where an ascending argsort places the largest values.
    keep = T.lt(pos_grid, k_grid)[:, ::-1]

    order = T.argsort(x, axis=3)
    # Shift by one before masking so dropped slots become -1 instead of
    # colliding with the valid word index 0.
    kept_idx = ((order + 1) * keep.dimshuffle(0, 'x', 'x', 1)) - 1
    # Sorting the surviving indices restores original word order; the -1
    # placeholders collect at the front.
    kept_idx = kept_idx.sort(axis=3)
    k_out = T.cast(T.ceil(n_words * k_max_factor), 'int32')
    # NOTE(review): rows whose k is smaller than k_out keep some -1 entries,
    # which index the last word when gathered below -- confirm this is intended.
    kept_idx = kept_idx[:, :, :, -k_out:]

    # Build flat index vectors for an element-wise gather over all four axes.
    batch_ix = T.repeat(T.arange(n_batch), n_chan * n_dim * k_out)
    chan_ix = T.repeat(T.arange(n_chan), n_dim * k_out).dimshuffle('x', 0)
    chan_ix = T.repeat(chan_ix, n_batch, axis=0).flatten()
    dim_ix = T.repeat(T.arange(n_dim), k_out, axis=0).dimshuffle('x', 'x', 0)
    dim_ix = T.repeat(dim_ix, n_chan, axis=1)
    dim_ix = T.repeat(dim_ix, n_batch, axis=0).flatten()
    word_ix = kept_idx.flatten()

    gathered = x[batch_ix, chan_ix, dim_ix, word_ix]
    gathered = gathered.reshape((n_batch, n_chan, n_dim, k_out))
    # Back to (batch, channel, word, dim) ordering.
    return gathered.dimshuffle(0, 1, 3, 2)
def _get_valid_cost(self, input_vec, *args, **kwargs):
    """Register top-1 / top-k accuracy monitors on a leading data slice.

    Only the first ``ceil(n * self.config.sample_percent_for_test)`` rows of
    ``input_vec`` (and of ``self.hashtag``) are evaluated.  The resulting
    variables are stored in ``self.monitor_valid_vars`` and
    ``self.stop_monitor_var``; the method returns nothing.  Extra positional
    and keyword arguments are ignored.
    """
    fraction = self.config.sample_percent_for_test
    n_eval = tensor.ceil(input_vec.shape[0] * fraction).astype('int32')

    scores = self._get_pred_dist(input_vec[0:n_eval])
    # Reverse the ascending argsort so column 0 holds the best-scored class.
    ranking = tensor.argsort(scores, axis=1)[:, ::-1]

    top1_accuracy = tensor.eq(self.hashtag[0:n_eval], ranking[:, 0]).mean()
    top1_accuracy.name = "top1_accuracy"

    # A row counts as a hit when its label appears in the first self.rank
    # ranked classes.
    hits = tensor.eq(ranking[:, 0:self.rank], self.hashtag[0:n_eval, None])
    top10_accuracy = tensor.sum(hits, axis=1).mean()
    top10_accuracy.name = "top10_accuracy"

    self.monitor_valid_vars = [[top1_accuracy], [top10_accuracy]]
    self.stop_monitor_var = top10_accuracy
def __init__(self, numpy_rng, theano_rng, input, input_shape, indices,
             length, max_length=30, n_out=1, batch_size=100, W=None):
    """Build a layer that weights ``input`` with linearly interpolated weights.

    A 1-D weight vector ``W`` of size ``input_shape[1]`` is looked up at the
    floor and ceil of the fractional ``indices`` and interpolated linearly
    between the two.  The input is multiplied by those factors, summed over
    axis 2 and divided by ``length + 1``.

    numpy_rng, theano_rng -- random generators, stored on the instance
    input       -- stored as ``self.x`` (3D tensor)
    input_shape -- only element 1 is read (weight vector size)
    indices     -- fractional positions into ``W`` (2D tensor)
    length      -- per-row normaliser (1D tensor)
    max_length  -- stored as a float
    n_out       -- stored on the instance
    batch_size  -- accepted but not used here
    W           -- optional existing shared weight vector to reuse
    """
    self.numpy_rng = numpy_rng
    self.theano_rng = theano_rng
    self.n_out = n_out
    self.n_in = input_shape[1]
    self.max_length = float(max_length)
    self.x = input  # 3D tensor
    self.indices = indices  # 2D tensor
    self.length = length  # 1D tensor

    # Alternative initial weights; currently unused (see the W_values call).
    init_W = [
        0.54457003, 0.72741562, 1.39331913, 1.12367916, 0.79878163,
        0.27706152, 0.3593896, 0.39622781, 0.27895978, 0.23260947,
        0.26763204, 0.27084899, 0.07067534, 0.13463201, 0.07948229,
        0.02779013, 0.12053657, 0.14807181, 0.24277158, 0.36964679,
        0.1601541, 0.37342793, 0.47257897, 0.39729786, 0.56589139,
        0.30535939, 0.10021771, 0.07151619, 0.12510002, 0.3112531,
        0.43562451, 0.05050614, 0.07199406, 0.50659907, 0.42588547,
    ]

    if W is not None:
        self.W = W
    else:
        # low == high, so every weight starts at exactly 0.5.
        W_values = numpy.asarray(
            self.numpy_rng.uniform(low=0.5, high=0.5, size=(self.n_in)),
            # init_W,
            # numpy.linspace(1.0, 0.0, self.n_in),
            dtype=theano.config.floatX)
        self.W = theano.shared(value=W_values, name='W', borrow=True)

    # Integer neighbours of each fractional index and their weights.
    upper = T.ceil(self.indices).astype('int8')
    lower = T.floor(self.indices).astype('int8')
    self.indices_high = upper
    self.indices_low = lower
    self.factors_high = self.W[upper]
    self.factors_low = self.W[lower]

    # Linear interpolation; the 1E-5 guards against division by zero when an
    # index is already integral (upper == lower).
    rise = (self.factors_high - self.factors_low) * (self.indices - lower)
    run = upper - lower + 1E-5
    self.factors = rise / run + self.factors_low

    weighted = self.x * T.transpose(self.factors).dimshuffle(0, 'x', 1)
    normaliser = (self.length + 1.0).dimshuffle(0, 'x')
    self.output = T.sum(weighted, axis=2) / normaliser

    self.params = [self.W]
class dynamicKMaxPoolingLayer(Layer): def __init__(self, incoming, kTop, numOfLayers, layerNumber, **kwargs): super(dynamicKMaxPoolingLayer, self).__init__(incoming, **kwargs) self.kTop = kTop self.numOfLayers = numOfLayers self.layerNumber = layerNumber # As per the definition in Kalchbrenner's paper, the # k value for k-max pooling is dynamically given as : self.k = T.cast(T.max([self.kTop, T.ceil((self.numOfLayers - self.layerNumber)*self.input_shape[3]/float(self.numOfLayers))]), 'int16') def get_output_for(self, input)
def get_hidden_values(self, input):
    """Convolve ``input``, k-max pool the result and apply a biased tanh.

    The raw convolution is stored in ``self.conv_out`` and the symbolic shape
    of the returned tensor in ``self.shape``.
    """
    # Convolve input feature maps with filters ("full" border mode).
    conv_out = conv.conv2d(
        input=input,
        filters=self.W,
        border_mode="full",
        filter_shape=self.kshp,
        image_shape=self.imshp,
    )
    self.conv_out = conv_out

    # k-max pooling with k = max(k_Top, ceil(factor * s)).
    dynamic_k = T.ceil(self.factor * self.s)
    k = T.cast(T.max((self.k_Top, dynamic_k)), "int32")
    pooled = self.kmaxPool(conv_out, conv_out.shape, k)

    # The bias is broadcast along every axis except axis 1.
    activated = T.tanh(pooled + self.b.dimshuffle("x", 0, "x", "x"))
    self.shape = activated.shape
    return activated
def R2_RNN_block(tparams, inputs, prefix=None, name='r2_rnn', std=True):
    """Repeatedly collapse adjacent pairs of time steps with ``RNN_layer``.

    ``inputs`` is sliced as a 3-D tensor with time on axis 0.  The block runs
    ``ceil(log2(n_steps))`` rounds; in each round every window of up to two
    consecutive steps is passed through ``RNN_layer`` and replaced by the last
    output of that call, so the number of valid steps is ``(num + 1) // 2``
    after a round.

    tparams -- parameter container forwarded to ``RNN_layer``
    inputs  -- input sequence tensor
    prefix, name -- combined through ``GetPrefix`` to name parameters/scans
    std     -- when true, ``inputs`` is passed through ``standardize`` first

    Returns the valid leading slice of the final buffer.
    """
    prefix = GetPrefix(prefix, name)
    n_steps = inputs.shape[0]
    n_rounds = T.ceil(T.log2(n_steps)).astype('uint32')
    rounds = T.arange(n_rounds)

    def _step_inner(index, num, inps):
        # Window [2*index, min(2*index + 2, num)); keep the last RNN output.
        start = index * 2
        stop = T.minimum(start + 2, num)
        h = RNN_layer(tparams, inps[start:stop, :, :],
                      prefix=prefix, name=None, std=False)
        return h[-1, :, :]

    def _step(r_step, num, inps, std=True):
        # Floor division keeps the window count an integer on every Python
        # version; "/" would produce a float range under true division, and
        # those values are used as slice indices in _step_inner.
        steps = T.arange((num + 1) // 2)
        out, _ = theano.scan(
            lambda index, num, inps: _step_inner(index, num, inps),
            sequences=[steps],
            outputs_info=None,
            non_sequences=[num, inps],
            name=_p(prefix, 'inner_scan'),
            n_steps=steps.shape[0],
            profile=False)
        # Write the shortened sequence into a zero buffer of the original
        # shape so the carried state keeps a fixed shape across rounds.
        new_num = out.shape[0]
        buf = T.set_subtensor(T.zeros_like(inps)[:new_num], out)
        return new_num, buf

    if std:
        inputs = standardize(inputs)

    out, _ = theano.reduce(
        lambda r_step, num, inps: _step(r_step, num, inps),
        sequences=rounds,
        outputs_info=[inputs.shape[0], inputs],
        name=_p(prefix, 'scan'))
    # out[0] is the final number of valid steps, out[1] the padded buffer.
    return out[1][:out[0]]
def attend(self, y_p):
    """Compute the attention input for one step and the state updates.

    A softmax over ``y_p . T_W + T_b`` gives an expected skip value, which is
    blended with the value read from ``self.y_t`` using the ``smooth``
    attribute.  The attended row is read from ``self.B`` at ``ceil(self.t)``.

    Returns ``(inp, updates)`` where ``updates`` maps ``self.t`` to
    ``max(self.t - skip, 0)`` and ``self.ns`` to ``self.ns - 1``.
    """
    logits = T.dot(y_p, self.T_W) + self.T_b
    # NOTE(review): the second positional argument of T.constant appears to be
    # ``name`` rather than ``dtype`` in Theano's signature -- confirm intent.
    smooth = T.constant(self.attrs['smooth'], 'float32')
    step_idx = T.cast(self.ns, 'int32')

    # Expected skip under the softmax distribution over 0..max_skip-1.
    skip_values = T.arange(self.base[0].attrs['max_skip'], dtype='float32')
    expected_skip = T.dot(T.nnet.softmax(logits), skip_values)  # + numpy.float32(1)

    # Blend the stored value with the prediction.
    stored = self.y_t[step_idx,
                      T.arange(self.y_t.shape[1]),
                      T.cast(self.t, 'int32')]
    skip = smooth * stored + (numpy.float32(1) - smooth) * expected_skip

    pos = T.cast(T.ceil(self.t), 'int32')
    inp = T.dot(self.B[pos, T.arange(pos.shape[0])], self.W_att_in)

    updates = {}
    # Move the position back by the skip, clamped at zero.
    updates[self.t] = T.maximum(self.t - skip, numpy.float32(0))
    updates[self.ns] = self.ns - numpy.float32(1)
    return inp, updates
def pad_to_a_multiple(tensor_, k, pad_with):
    """Pad a tensor to make its first dimension a multiple of a number.

    The padded length is computed with integer ceil-division rather than a
    float32 division, so it stays exact for first dimensions larger than
    2**24 (which float32 cannot represent exactly).

    Parameters
    ----------
    tensor_ : :class:`~theano.Variable`
    k : int
        The number, multiple of which the length of tensor is made.
    pad_with : float or int
        The value for padding.

    Returns
    -------
    :class:`~theano.Variable`
        A tensor of the same rank whose first dimension is the smallest
        multiple of `k` not below the original length; the original values
        occupy the leading rows and the rest is filled with `pad_with`.

    """
    length = tensor_.shape[0]
    # ceil(length / k) * k using integers only.
    new_length = (((length + k - 1) // k) * k).astype('int64')
    new_shape = tensor.set_subtensor(tensor_.shape[:1], new_length)
    # Allocate a flat buffer of the fill value, then give it the target shape.
    canvas = tensor.alloc(pad_with, tensor.prod(new_shape)).reshape(
        new_shape, ndim=tensor_.ndim)
    return tensor.set_subtensor(canvas[:length], tensor_)
def Output(self):
    """Return tanh(k-max-pooled convolution + bias) for ``self.x``.

    The input is convolved with ``self.W`` in "full" border mode.  When
    ``self.do_fold`` is set the convolution result is first passed through
    ``self.Fold``; the result is then k-max pooled with
    ``k = max(k_Top, ceil(factor * s))``.
    """
    # Convolve input with trained parameters.
    conv_out = conv.conv2d(input=self.x, filters=self.W, border_mode='full',
                           filter_shape=self.kshp, image_shape=self.imshp)

    # Fold conv result into two, if requested.
    to_pool = self.Fold(conv_out) if self.do_fold else conv_out

    # k-max pooling.
    k = T.cast(T.max((self.k_Top, T.ceil(self.factor * self.s))), 'int32')
    pooled_out = self.kmaxPool(to_pool, to_pool.shape, k)

    # The bias is broadcast along every axis except axis 1.
    return T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
def __init__(self, numpy_rng, theano_rng, input, input_shape, indices,
             length, max_length=30, n_out=1, batch_size=100, W=None):
    """Build a layer that weights ``input`` with linearly interpolated weights.

    A 1-D weight vector ``W`` of size ``input_shape[1]`` is looked up at the
    floor and ceil of the fractional ``indices`` and interpolated linearly
    between the two.  The input is multiplied by those factors, summed over
    axis 2 and divided by ``length + 1``.

    numpy_rng, theano_rng -- random generators, stored on the instance
    input       -- stored as ``self.x`` (3D tensor)
    input_shape -- only element 1 is read (weight vector size)
    indices     -- fractional positions into ``W`` (2D tensor)
    length      -- per-row normaliser (1D tensor)
    max_length  -- stored as a float
    n_out       -- stored on the instance
    batch_size  -- accepted but not used here
    W           -- optional existing shared weight vector to reuse
    """
    self.numpy_rng = numpy_rng
    self.theano_rng = theano_rng
    self.n_out = n_out
    self.n_in = input_shape[1]
    self.max_length = float(max_length)
    self.x = input  # 3D tensor
    self.indices = indices  # 2D tensor
    self.length = length  # 1D tensor

    # Alternative initial weights; currently unused (see the W_values call).
    init_W = [
        0.54457003, 0.72741562, 1.39331913, 1.12367916, 0.79878163,
        0.27706152, 0.3593896, 0.39622781, 0.27895978, 0.23260947,
        0.26763204, 0.27084899, 0.07067534, 0.13463201, 0.07948229,
        0.02779013, 0.12053657, 0.14807181, 0.24277158, 0.36964679,
        0.1601541, 0.37342793, 0.47257897, 0.39729786, 0.56589139,
        0.30535939, 0.10021771, 0.07151619, 0.12510002, 0.3112531,
        0.43562451, 0.05050614, 0.07199406, 0.50659907, 0.42588547,
    ]

    if W is not None:
        self.W = W
    else:
        # low == high, so every weight starts at exactly 0.5.
        W_values = numpy.asarray(
            self.numpy_rng.uniform(low=0.5, high=0.5, size=(self.n_in)),
            # init_W,
            # numpy.linspace(1.0, 0.0, self.n_in),
            dtype=theano.config.floatX)
        self.W = theano.shared(value=W_values, name='W', borrow=True)

    # Integer neighbours of each fractional index and their weights.
    upper = T.ceil(self.indices).astype('int8')
    lower = T.floor(self.indices).astype('int8')
    self.indices_high = upper
    self.indices_low = lower
    self.factors_high = self.W[upper]
    self.factors_low = self.W[lower]

    # Linear interpolation; the 1E-5 guards against division by zero when an
    # index is already integral (upper == lower).
    rise = (self.factors_high - self.factors_low) * (self.indices - lower)
    run = upper - lower + 1E-5
    self.factors = rise / run + self.factors_low

    weighted = self.x * T.transpose(self.factors).dimshuffle(0, 'x', 1)
    normaliser = (self.length + 1.0).dimshuffle(0, 'x')
    self.output = T.sum(weighted, axis=2) / normaliser

    self.params = [self.W]
def weighted_sentence(sentence, sent_len, W):
    """Embed a sentence as the concatenation of per-section mean embeddings.

    The sentence is cut into ``num_section`` consecutive sections of
    ``ceil(sent_len / num_section)`` words; each section is represented by the
    mean of its word embeddings (rows of ``W``), and the section vectors are
    concatenated.  ``num_section``, ``emb_dim`` and ``io_vocab`` are read from
    the enclosing scope.

    sentence -- word indices of the sentence
    sent_len -- number of words in the sentence
    W        -- embedding matrix indexed by word index

    Returns the concatenated section vector, or, when the sentence has fewer
    words than ``num_section``, the embeddings of its first ``num_section``
    entries reshaped to ``(1, emb_dim * num_section)``.
    """
    sec_length = T.cast(T.ceil(sent_len / float(num_section)), 'int32')

    # Every section except the last one has exactly sec_length words.
    # Collecting the vectors in a list and concatenating once avoids the
    # first-iteration special case and also works for num_section == 1.
    section_vectors = []
    for sec_num in range(num_section - 1):
        sec_start_id = sec_num * sec_length
        sec_end_id = (sec_num + 1) * sec_length
        section_vectors.append(
            T.mean(W[sentence[sec_start_id:sec_end_id].flatten()], axis=0))

    # The last section takes whatever is left.
    sec_start_id = (num_section - 1) * sec_length
    sec_end_id = sent_len
    # if sec_start_id >= sent_len, it means this section should contain 0
    # words, so use EOS embedding instead.
    section_vectors.append(
        T.switch(T.ge(sec_start_id, sent_len),
                 W[io_vocab.VocabConstants.EOS_INDEX],
                 T.mean(W[sentence[sec_start_id:sec_end_id].flatten()], axis=0)))

    # axis=0 because every section vector is a vector.
    global_vector = T.concatenate(section_vectors, axis=0)

    global_vector_for_short = W[sentence[:num_section].flatten()].reshape(
        (1, emb_dim * num_section))
    return T.switch(T.gt(num_section, sent_len),
                    global_vector_for_short, global_vector)
def get_hidden_values(self, input):
    """Convolve, k-max pool, then encode the result with a fully connected AE.

    Side effects: stores the raw convolution in ``self.conv_out``, the shape
    of the pooled activation in ``self.shape``, a newly built ``AE`` in
    ``self.fully_connected``, and appends that AE's parameters to
    ``self.params`` on every call.

    Returns the hidden values of the AE for the flattened pooled activation.
    """
    # Convolve input feature maps with filters ("full" border mode).
    conv_out = conv.conv2d(
        input=input,
        filters=self.W,
        border_mode="full",
        filter_shape=self.kshp,
        image_shape=self.imshp,
    )
    self.conv_out = conv_out

    # k-max pooling with k = max(k_Top, ceil(factor * s)).
    dynamic_k = T.ceil(self.factor * self.s)
    k = T.cast(T.max((self.k_Top, dynamic_k)), "int32")
    pooled = self.kmaxPool(conv_out, conv_out.shape, k)

    activated = T.tanh(pooled + self.b.dimshuffle("x", 0, "x", "x"))
    self.shape = activated.shape

    # Flatten everything after the first axis for the dense encoder.
    hidden_input = activated.flatten(2)
    # nkerns[0] replaced with 8
    # NOTE(review): the factor 25 in n_visible is hard-coded -- confirm it
    # matches the pooled feature-map size.
    encoder = AE(
        (self.rng),
        input=hidden_input,
        n_visible=self.kshp[0] * 25 * self.k_Top,
        n_hidden=60,
    )
    self.fully_connected = encoder
    self.params.extend(encoder.params)
    return encoder.get_hidden_values(hidden_input)
def apply(self, image, image_shape, location, scale):
    """Extract a patch from ``image`` and report the fraction of input skipped.

    The hard window bounds come from ``self.compute_hard_windows``.  If the
    instance has a ``cropop`` attribute it performs the extraction directly;
    otherwise the bounds are rounded outwards to integers and
    ``self.apply_inner`` is used, either once on the bounding box of all
    windows (``self.batched_window``) or per sample via ``theano.map``
    (``self.scan``).

    Returns ``(patch, savings)`` with
    ``savings = 1 - prod(b - a) / prod(image_shape)`` along axis 1.
    """
    lower, upper = self.compute_hard_windows(image_shape, location, scale)

    if hasattr(self, "cropop"):
        patch = self.cropop(image, lower, upper, location, scale)
    else:
        # make integer: round the window outwards.
        lower = T.cast(T.floor(lower), 'int16')
        upper = T.cast(T.ceil(upper), 'int16')

        if self.batched_window:
            # take the bounding box of all windows; now the slices
            # will have the same length for each sample and scan can
            # be avoided.  comes at the cost of typically selecting
            # more of the input.
            lower = lower.min(axis=0, keepdims=True)
            upper = upper.max(axis=0, keepdims=True)
            patch = self.apply_inner(image, location, scale,
                                     lower[0], upper[0])
        elif self.scan:
            def extract_one(img, lo, hi, loc, scl):
                # apply_inner expects a batch axis, so add one and strip it
                # from the result again.
                single = self.apply_inner(T.shape_padleft(img),
                                          T.shape_padleft(loc),
                                          T.shape_padleft(scl),
                                          lo, hi)
                return single[0]

            patch, _ = theano.map(
                extract_one,
                sequences=[image, lower, upper, location, scale])

    window_area = T.cast((upper - lower).prod(axis=1), floatX)
    savings = (1 - window_area / image_shape.prod(axis=1))
    return patch, savings
def take_glimpses(self, attended, preprocessed_attended=None,
                  attended_mask=None, weights=None, step=None, **states):
    """Compute attention restricted to a window chosen by ``self.prior``.

    The prior type (``self.prior['type']``, default ``'expanding'``) selects
    how the window ``[begin, end)`` along axis 0 of ``attended`` is chosen:

    * ``'expanding'`` -- bounds grow linearly with ``step[0]`` and are
      clipped to the sequence length;
    * ``'window_around_mean'`` / ``'window_around_median'`` -- a window of
      ``before`` / ``after`` positions around the mean or median of the
      previous ``weights``, plus a per-row mask inside the global window.

    Energies, weights and weighted averages are computed on the cut window
    only and the results are pasted back into zero tensors of full length.

    Returns ``(weighted_averages, new_weights, new_energies, step + 1)``.

    Raises ``ValueError`` for an unknown ``window_around*`` variant and
    ``Exception`` for any other unknown prior type.
    """
    # Cut the considered window.
    p = self.prior
    length = attended.shape[0]
    prior_type = p.get('type', 'expanding')
    if prior_type == 'expanding':
        begin = p['initial_begin'] + step[0] * p['min_speed']
        end = p['initial_end'] + step[0] * p['max_speed']
        begin = tensor.maximum(0, tensor.minimum(length - 1, begin))
        end = tensor.maximum(0, tensor.minimum(length, end))
        additional_mask = None
    elif prior_type.startswith('window_around'):
        # check whether we want the mean or median!
        if prior_type == 'window_around_mean':
            position_in_attended = tensor.arange(length, dtype=floatX)[None, :]
            expected_last_source_pos = (
                weights * position_in_attended).sum(axis=1)
        elif prior_type == 'window_around_median':
            # The first position where the cumulative weight crosses 0.5 is
            # found as the argmax of the step in the thresholded cumsum.
            ali_to_05 = tensor.extra_ops.cumsum(weights, axis=1) - 0.5
            ali_to_05 = (ali_to_05 >= 0)
            ali_median_pos = ali_to_05[:, 1:] - ali_to_05[:, :-1]
            expected_last_source_pos = tensor.argmax(ali_median_pos, axis=1)
            expected_last_source_pos = theano.gradient.disconnected_grad(
                expected_last_source_pos)
        else:
            raise ValueError("Unknown prior type: %s" % prior_type)
        # the window taken around each element
        begins = tensor.floor(expected_last_source_pos - p['before'])
        ends = tensor.ceil(expected_last_source_pos + p['after'])
        # the global window to optimize computations
        begin = tensor.maximum(0, begins.min()).astype('int64')
        end = tensor.minimum(length, ends.max()).astype('int64')
        # the new mask, already cut to begin:end
        position_in_attended_cut = tensor.arange(
            begin * 1., end * 1., 1., dtype=floatX)[None, :]
        additional_mask = ((position_in_attended_cut > begins[:, None]) *
                           (position_in_attended_cut < ends[:, None]))
    else:
        # Format the message eagerly; passing the value as a second argument
        # would leave the "%s" placeholder unformatted.
        raise Exception("Unknown prior type: %s" % prior_type)

    begin = tensor.floor(begin).astype('int64')
    end = tensor.ceil(end).astype('int64')
    attended_cut = attended[begin:end]
    # Optional tensors are compared against None explicitly instead of
    # relying on the truth value of a symbolic variable.
    preprocessed_attended_cut = (preprocessed_attended[begin:end]
                                 if preprocessed_attended is not None
                                 else None)
    # NOTE(review): when attended_mask is None this multiplies None by the
    # additional mask and fails -- a mask appears to be required in practice;
    # confirm against callers.
    attended_mask_cut = (
        (attended_mask[begin:end] if attended_mask is not None else None) *
        (additional_mask.T if additional_mask is not None else 1))
    weights_cut = weights[:, begin:end]

    # Call
    energies_cut = self.compute_energies(attended_cut,
                                         preprocessed_attended_cut,
                                         weights_cut, states)
    weights_cut = self.compute_weights(energies_cut, attended_mask_cut)
    weighted_averages = self.compute_weighted_averages(weights_cut,
                                                       attended_cut)

    # Paste
    new_weights = new_energies = tensor.zeros_like(weights.T)
    new_weights = tensor.set_subtensor(new_weights[begin:end], weights_cut)
    new_energies = tensor.set_subtensor(new_energies[begin:end], energies_cut)
    return weighted_averages, new_weights.T, new_energies.T, step + 1
def __init__(self, n_out = None, n_units = None, direction = 1, truncation = -1, sampling = 1, encoder = None, unit = 'lstm', n_dec = 0, attention = "none", recurrent_transform = "none", recurrent_transform_attribs = "{}", attention_template = 128, attention_distance = 'l2', attention_step = "linear", attention_beam = 0, attention_norm = "exp", attention_momentum = "none", attention_sharpening = 1.0, attention_nbest = 0, attention_store = False, attention_smooth = False, attention_glimpse = 1, attention_filters = 1, attention_accumulator = 'sum', attention_loss = 0, attention_bn = 0, attention_lm = 'none', attention_ndec = 1, attention_memory = 0, attention_alnpts = 0, attention_epoch = 1, attention_segstep=0.01, attention_offset=0.95, attention_method="epoch", attention_scale=10, context=-1, base = None, aligner = None, lm = False, force_lm = False, droplm = 1.0, forward_weights_init=None, bias_random_init_forget_shift=0.0, copy_weights_from_base=False, segment_input=False, join_states=False, sample_segment=None, **kwargs): """ :param n_out: number of cells :param n_units: used when initialized via Network.from_hdf_model_topology :param direction: process sequence in forward (1) or backward (-1) direction :param truncation: gradient truncation :param sampling: scan every nth frame only :param encoder: list of encoder layers used as initalization for the hidden state :param unit: cell type (one of 'lstm', 'vanilla', 'gru', 'sru') :param n_dec: absolute number of steps to unfold the network if integer, else relative number of steps from encoder :param recurrent_transform: name of recurrent transform :param recurrent_transform_attribs: dictionary containing parameters for a recurrent transform :param attention_template: :param attention_distance: :param attention_step: :param attention_beam: :param attention_norm: :param attention_sharpening: :param attention_nbest: :param attention_store: :param attention_align: :param attention_glimpse: :param attention_lm: :param 
base: list of layers which outputs are considered as based during attention mechanisms :param lm: activate RNNLM :param force_lm: expect previous labels to be given during testing :param droplm: probability to take the expected output as predecessor instead of the real one when LM=true :param bias_random_init_forget_shift: initialize forget gate bias of lstm networks with this value """ source_index = None if len(kwargs['sources']) == 1 and (kwargs['sources'][0].layer_class.endswith('length') or kwargs['sources'][0].layer_class.startswith('length')): kwargs['sources'] = [] source_index = kwargs['index'] unit_given = unit from Device import is_using_gpu if unit == 'lstm': # auto selection if not is_using_gpu(): unit = 'lstme' elif recurrent_transform == 'none' and (not lm or droplm == 0.0): unit = 'lstmp' else: unit = 'lstmc' elif unit in ("lstmc", "lstmp") and not is_using_gpu(): unit = "lstme" if segment_input: if is_using_gpu(): unit = "lstmps" else: unit = "lstms" if n_out is None: assert encoder n_out = sum([enc.attrs['n_out'] for enc in encoder]) kwargs.setdefault("n_out", n_out) if n_units is not None: assert n_units == n_out self.attention_weight = T.constant(1.,'float32') if len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('length'): kwargs['sources'] = [] elif len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('signal'): kwargs['sources'] = [] super(RecurrentUnitLayer, self).__init__(**kwargs) self.set_attr('from', ",".join([s.name for s in self.sources]) if self.sources else "null") self.set_attr('n_out', n_out) self.set_attr('unit', unit_given.encode("utf8")) self.set_attr('truncation', truncation) self.set_attr('sampling', sampling) self.set_attr('direction', direction) self.set_attr('lm', lm) self.set_attr('force_lm', force_lm) self.set_attr('droplm', droplm) if bias_random_init_forget_shift: self.set_attr("bias_random_init_forget_shift", bias_random_init_forget_shift) self.set_attr('attention_beam', 
attention_beam) self.set_attr('recurrent_transform', recurrent_transform.encode("utf8")) if isinstance(recurrent_transform_attribs, str): recurrent_transform_attribs = json.loads(recurrent_transform_attribs) if attention_template is not None: self.set_attr('attention_template', attention_template) self.set_attr('recurrent_transform_attribs', recurrent_transform_attribs) self.set_attr('attention_distance', attention_distance.encode("utf8")) self.set_attr('attention_step', attention_step.encode("utf8")) self.set_attr('attention_norm', attention_norm.encode("utf8")) self.set_attr('attention_sharpening', attention_sharpening) self.set_attr('attention_nbest', attention_nbest) attention_store = attention_store or attention_smooth or attention_momentum != 'none' self.set_attr('attention_store', attention_store) self.set_attr('attention_smooth', attention_smooth) self.set_attr('attention_momentum', attention_momentum.encode('utf8')) self.set_attr('attention_glimpse', attention_glimpse) self.set_attr('attention_filters', attention_filters) self.set_attr('attention_lm', attention_lm) self.set_attr('attention_bn', attention_bn) self.set_attr('attention_accumulator', attention_accumulator) self.set_attr('attention_ndec', attention_ndec) self.set_attr('attention_memory', attention_memory) self.set_attr('attention_loss', attention_loss) self.set_attr('n_dec', n_dec) self.set_attr('segment_input', segment_input) self.set_attr('attention_alnpts', attention_alnpts) self.set_attr('attention_epoch', attention_epoch) self.set_attr('attention_segstep', attention_segstep) self.set_attr('attention_offset', attention_offset) self.set_attr('attention_method', attention_method) self.set_attr('attention_scale', attention_scale) if segment_input: if not self.eval_flag: #if self.eval_flag: if isinstance(self.sources[0],RecurrentUnitLayer): self.inv_att = self.sources[0].inv_att #NBT else: if not join_states: self.inv_att = self.sources[0].attention #NBT else: assert hasattr(self.sources[0], 
"nstates"), "source does not have number of states!" ns = self.sources[0].nstates self.inv_att = self.sources[0].attention[(ns-1)::ns] inv_att = T.roll(self.inv_att.dimshuffle(2, 1, 0),1,axis=0)#TBN inv_att = T.set_subtensor(inv_att[0],T.zeros((inv_att.shape[1],inv_att.shape[2]))) inv_att = T.max(inv_att,axis=-1) else: inv_att = T.zeros((self.sources[0].output.shape[0],self.sources[0].output.shape[1])) if encoder and hasattr(encoder[0],'act'): self.set_attr('encoder', ",".join([e.name for e in encoder])) if base: self.set_attr('base', ",".join([b.name for b in base])) else: base = encoder self.base = base self.encoder = encoder if aligner: self.aligner = aligner self.set_attr('n_units', n_out) unit = eval(unit.upper())(**self.attrs) assert isinstance(unit, Unit) self.unit = unit kwargs.setdefault("n_out", unit.n_out) n_out = unit.n_out self.set_attr('n_out', unit.n_out) if n_dec < 0: source_index = self.index n_dec *= -1 if n_dec != 0: self.target_index = self.index if isinstance(n_dec,float): if not source_index: source_index = encoder[0].index if encoder else base[0].index lengths = T.cast(T.ceil(T.sum(T.cast(source_index,'float32'),axis=0) * n_dec), 'int32') idx, _ = theano.map(lambda l_i, l_m:T.concatenate([T.ones((l_i,),'int8'),T.zeros((l_m-l_i,),'int8')]), [lengths], [T.max(lengths)+1]) self.index = idx.dimshuffle(1,0)[:-1] n_dec = T.cast(T.ceil(T.cast(source_index.shape[0],'float32') * numpy.float32(n_dec)),'int32') else: if encoder: self.index = encoder[0].index self.index = T.ones((n_dec,self.index.shape[1]),'int8') else: n_dec = self.index.shape[0] # initialize recurrent weights self.W_re = None if unit.n_re > 0: self.W_re = self.add_param(self.create_recurrent_weights(unit.n_units, unit.n_re, name="W_re_%s" % self.name)) # initialize forward weights bias_init_value = self.create_bias(unit.n_in).get_value() if bias_random_init_forget_shift: assert unit.n_units * 4 == unit.n_in # (input gate, forget gate, output gate, net input) 
bias_init_value[unit.n_units:2 * unit.n_units] += bias_random_init_forget_shift self.b.set_value(bias_init_value) if not forward_weights_init: forward_weights_init = "random_uniform(p_add=%i)" % unit.n_re else: self.set_attr('forward_weights_init', forward_weights_init) self.forward_weights_init = forward_weights_init self.W_in = [] sample_mean, gamma = None, None if copy_weights_from_base: self.params = {} #self.W_re = self.add_param(base[0].W_re) #self.W_in = [ self.add_param(W) for W in base[0].W_in ] #self.b = self.add_param(base[0].b) self.W_re = base[0].W_re self.W_in = base[0].W_in self.b = base[0].b if self.attrs.get('batch_norm', False): sample_mean = base[0].sample_mean gamma = base[0].gamma #self.masks = base[0].masks #self.mass = base[0].mass else: for s in self.sources: W = self.create_forward_weights(s.attrs['n_out'], unit.n_in, name="W_in_%s_%s" % (s.name, self.name)) self.W_in.append(self.add_param(W)) # make input z = self.b for x_t, m, W in zip(self.sources, self.masks, self.W_in): if x_t.attrs['sparse']: if x_t.output.ndim == 3: out_dim = x_t.output.shape[2] elif x_t.output.ndim == 2: out_dim = 1 else: assert False, x_t.output.ndim if x_t.output.ndim == 3: z += W[T.cast(x_t.output[:,:,0], 'int32')] elif x_t.output.ndim == 2: z += W[T.cast(x_t.output, 'int32')] else: assert False, x_t.output.ndim elif m is None: z += T.dot(x_t.output, W) else: z += self.dot(self.mass * m * x_t.output, W) #if self.attrs['batch_norm']: # z = self.batch_norm(z, unit.n_in) num_batches = self.index.shape[1] self.num_batches = num_batches non_sequences = [] if self.attrs['lm'] or attention_lm != 'none': if not 'target' in self.attrs: self.attrs['target'] = 'classes' if self.attrs['droplm'] > 0.0 or not (self.train_flag or force_lm): if copy_weights_from_base: self.W_lm_in = base[0].W_lm_in self.b_lm_in = base[0].b_lm_in else: l = sqrt(6.) 
/ sqrt(unit.n_out + self.y_in[self.attrs['target']].n_out) values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(unit.n_out, self.y_in[self.attrs['target']].n_out)), dtype=theano.config.floatX) self.W_lm_in = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_in_"+self.name)) self.b_lm_in = self.create_bias(self.y_in[self.attrs['target']].n_out, 'b_lm_in') l = sqrt(6.) / sqrt(unit.n_in + self.y_in[self.attrs['target']].n_out) values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.y_in[self.attrs['target']].n_out, unit.n_in)), dtype=theano.config.floatX) if copy_weights_from_base: self.W_lm_out = base[0].W_lm_out else: self.W_lm_out = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_out_"+self.name)) if self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm): self.lmmask = 1 #if recurrent_transform != 'none': # recurrent_transform = recurrent_transform[:-3] elif self.attrs['droplm'] < 1.0 and (self.train_flag or force_lm): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams srng = RandomStreams(self.rng.randint(1234) + 1) self.lmmask = T.cast(srng.binomial(n=1, p=1.0 - self.attrs['droplm'], size=self.index.shape), theano.config.floatX).dimshuffle(0,1,'x').repeat(unit.n_in,axis=2) else: self.lmmask = T.zeros_like(self.index, dtype='float32').dimshuffle(0,1,'x').repeat(unit.n_in,axis=2) if recurrent_transform == 'input': # attention is just a sequence dependent bias (lstmp compatible) src = [] src_names = [] n_in = 0 for e in base: #src_base = [ s for s in e.sources if s.name not in src_names ] #src_names += [ s.name for s in e.sources ] src_base = [ e ] src_names += [e.name] src += [s.output for s in src_base] n_in += sum([s.attrs['n_out'] for s in src_base]) self.xc = T.concatenate(src, axis=2) l = sqrt(6.) 
/ sqrt(self.attrs['n_out'] + n_in) values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, 1)), dtype=theano.config.floatX) self.W_att_xc = self.add_param(self.shared(value=values, borrow=True, name = "W_att_xc")) values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, self.attrs['n_out'] * 4)), dtype=theano.config.floatX) self.W_att_in = self.add_param(self.shared(value=values, borrow=True, name = "W_att_in")) zz = T.exp(T.tanh(T.dot(self.xc, self.W_att_xc))) # TB1 self.zc = T.dot(T.sum(self.xc * (zz / T.sum(zz, axis=0, keepdims=True)).repeat(self.xc.shape[2],axis=2), axis=0, keepdims=True), self.W_att_in) recurrent_transform = 'none' elif recurrent_transform == 'attention_align': max_skip = base[0].attrs['max_skip'] values = numpy.zeros((max_skip,), dtype=theano.config.floatX) self.T_b = self.add_param(self.shared(value=values, borrow=True, name="T_b"), name="T_b") l = sqrt(6.) / sqrt(self.attrs['n_out'] + max_skip) values = numpy.asarray(self.rng.uniform( low=-l, high=l, size=(self.attrs['n_out'], max_skip)), dtype=theano.config.floatX) self.T_W = self.add_param(self.shared(value=values, borrow=True, name="T_W"), name="T_W") y_t = T.dot(self.base[0].attention, T.arange(self.base[0].output.shape[0], dtype='float32')) # NB y_t = T.concatenate([T.zeros_like(y_t[:1]), y_t], axis=0) # (N+1)B y_t = y_t[1:] - y_t[:-1] # NB self.y_t = y_t # T.clip(y_t,numpy.float32(0),numpy.float32(max_skip - 1)) self.y_t = T.cast(self.base[0].backtrace,'float32') elif recurrent_transform == 'attention_segment': assert aligner.attention, "Segment-wise attention requires attention points!" 
recurrent_transform_inst = RecurrentTransform.transform_classes[recurrent_transform](layer=self) assert isinstance(recurrent_transform_inst, RecurrentTransform.RecurrentTransformBase) unit.recurrent_transform = recurrent_transform_inst self.recurrent_transform = recurrent_transform_inst # scan over sequence for s in range(self.attrs['sampling']): index = self.index[s::self.attrs['sampling']] if context > 0: from TheanoUtil import context_batched n_batches = z.shape[1] time, batch, dim = z.shape[0], z.shape[1], z.shape[2] #z = context_batched(z[::direction or 1], window=context)[::direction or 1] # TB(CD) from theano.ifelse import ifelse def context_window(idx, x_in, i_in): x_out = x_in[idx:idx + context] x_out = x_out.dimshuffle('x',1,0,2).reshape((1, batch, dim * context)) i_out = i_in[idx:idx+1].repeat(context, axis=0) i_out = ifelse(T.lt(idx,context),T.set_subtensor(i_out[:context - idx],numpy.int8(0)),i_out).reshape((1, batch * context)) return x_out, i_out z = z[::direction or 1] i = index[::direction or 1] out, _ = theano.map(context_window, sequences = [T.arange(z.shape[0])], non_sequences = [T.concatenate([T.zeros((context - 1,z.shape[1],z.shape[2]),dtype='float32'),z],axis=0), i]) z = out[0][::direction or 1] i = out[1][::direction or 1] # T(BC) direction = 1 z = z.reshape((time * batch, context * dim)) # (TB)(CD) z = z.reshape((time * batch, context, dim)).dimshuffle(1,0,2) # C(TB)D i = i.reshape((time, context, batch)).dimshuffle(1,0,2).reshape((context, time * batch)) index = i num_batches = time * batch sequences = z sources = self.sources if encoder: if recurrent_transform == "attention_segment": if hasattr(encoder[0],'act'): outputs_info = [T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act)] else: # outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ] outputs_info[0] = self.aligner.output[-1] elif hasattr(encoder[0],'act'): outputs_info = [ T.concatenate([e.act[i][-1] for e in 
encoder], axis=1) for i in range(unit.n_act) ] else: outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ] sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0)) else: outputs_info = [ T.alloc(numpy.cast[theano.config.floatX](0), num_batches, unit.n_units) for a in range(unit.n_act) ] if self.attrs['lm'] and self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm): if self.network.y[self.attrs['target']].ndim == 3: sequences += T.dot(self.network.y[self.attrs['target']],self.W_lm_out) else: y = self.y_in[self.attrs['target']].flatten() sequences += self.W_lm_out[y].reshape((index.shape[0],index.shape[1],unit.n_in)) if sequences == self.b: sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0)) if unit.recurrent_transform: outputs_info += unit.recurrent_transform.get_sorted_state_vars_initial() index_f = T.cast(index, theano.config.floatX) unit.set_parent(self) if segment_input: outputs = unit.scan_seg(x=sources, z=sequences[s::self.attrs['sampling']], att = inv_att, non_sequences=non_sequences, i=index_f, outputs_info=outputs_info, W_re=self.W_re, W_in=self.W_in, b=self.b, go_backwards=direction == -1, truncate_gradient=self.attrs['truncation']) else: outputs = unit.scan(x=sources, z=sequences[s::self.attrs['sampling']], non_sequences=non_sequences, i=index_f, outputs_info=outputs_info, W_re=self.W_re, W_in=self.W_in, b=self.b, go_backwards=direction == -1, truncate_gradient=self.attrs['truncation']) if not isinstance(outputs, list): outputs = [outputs] if outputs: outputs[0].name = "%s.act[0]" % self.name if context > 0: for i in range(len(outputs)): outputs[i] = outputs[i][-1].reshape((outputs[i].shape[1]//n_batches,n_batches,outputs[i].shape[2])) if unit.recurrent_transform: 
unit.recurrent_transform_state_var_seqs = outputs[-len(unit.recurrent_transform.state_vars):] if self.attrs['sampling'] > 1: if s == 0: self.act = [ T.alloc(numpy.cast['float32'](0), self.index.shape[0], self.index.shape[1], n_out) for act in outputs ] self.act = [ T.set_subtensor(tot[s::self.attrs['sampling']], act) for tot,act in zip(self.act, outputs) ] else: self.act = outputs[:unit.n_act] if len(outputs) > unit.n_act: self.aux = outputs[unit.n_act:] if self.attrs['attention_store']: self.attention = [ self.aux[i].dimshuffle(0,2,1) for i,v in enumerate(sorted(unit.recurrent_transform.state_vars.keys())) if v.startswith('att_') ] # NBT for i in range(len(self.attention)): vec = T.eye(self.attention[i].shape[2], 1, -direction * (self.attention[i].shape[2] - 1)) last = vec.dimshuffle(1, 'x', 0).repeat(self.index.shape[1], axis=1) self.attention[i] = T.concatenate([self.attention[i][1:],last],axis=0)[::direction] self.cost_val = numpy.float32(0) if recurrent_transform == 'attention_align': back = T.ceil(self.aux[sorted(unit.recurrent_transform.state_vars.keys()).index('t')]) def make_output(base, yout, trace, length): length = T.cast(length, 'int32') idx = T.cast(trace[:length][::-1],'int32') x_out = T.concatenate([base[idx],T.zeros((self.index.shape[0] + 1 - length, base.shape[1]), 'float32')],axis=0) y_out = T.concatenate([yout[idx,T.arange(length)],T.zeros((self.index.shape[0] + 1 - length, ), 'float32')],axis=0) return x_out, y_out output, _ = theano.map(make_output, sequences = [base[0].output.dimshuffle(1,0,2), self.y_t.dimshuffle(1,2,0), back.dimshuffle(1,0), T.sum(self.index,axis=0,dtype='float32')]) self.attrs['n_out'] = base[0].attrs['n_out'] self.params.update(unit.params) self.output = output[0].dimshuffle(1,0,2)[:-1] z = T.dot(self.act[0], self.T_W)[:-1] + self.T_b z = z.reshape((z.shape[0] * z.shape[1], z.shape[2])) idx = (self.index[1:].flatten() > 0).nonzero() idy = (self.index[1:][::-1].flatten() > 0).nonzero() y_out = 
T.cast(output[1],'int32').dimshuffle(1, 0)[:-1].flatten() nll, _ = T.nnet.crossentropy_softmax_1hot(x=z[idx], y_idx=y_out[idy]) self.cost_val = T.sum(nll) recog = T.argmax(z[idx], axis=1) real = y_out[idy] self.errors = lambda: T.sum(T.neq(recog, real)) return back += T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32') idx = (self.index[:-1].flatten() > 0).nonzero() idx = T.cast(back[::-1].flatten()[idx],'int32') x_out = base[0].output #x_out = x_out.dimshuffle(1,0,2).reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx] #x_out = x_out.reshape((self.index.shape[1], self.index.shape[0] - 1, x_out.shape[1])).dimshuffle(1,0,2) x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx] x_out = x_out.reshape((self.index.shape[0] - 1, self.index.shape[1], x_out.shape[1])) self.output = T.concatenate([x_out, base[0].output[1:]],axis=0) self.attrs['n_out'] = base[0].attrs['n_out'] self.params.update(unit.params) return skips = T.dot(T.nnet.softmax(z), T.arange(z.shape[1], dtype='float32')).reshape(self.index[1:].shape) shift = T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32') skips = T.concatenate([T.zeros_like(self.y_t[:1]),self.y_t[:-1]],axis=0) idx = shift + T.cumsum(skips, axis=0) idx = T.cast(idx[:-1].flatten(),'int32') #idx = (idx.flatten() > 0).nonzero() #idx = base[0].attention.flatten() x_out = base[0].output[::-1] x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx] x_out = x_out.reshape((self.index.shape[0], self.index.shape[1], x_out.shape[1])) self.output = T.concatenate([base[0].output[-1:], x_out], axis=0)[::-1] self.attrs['n_out'] = base[0].attrs['n_out'] self.params.update(unit.params) return if recurrent_transform == 'batch_norm': self.params['sample_mean_batch_norm'].custom_update = T.dot(T.mean(self.act[0],axis=[0,1]),self.W_re) self.params['sample_mean_batch_norm'].custom_update_normalized = True 
self.make_output(self.act[0][::direction or 1], sample_mean=sample_mean, gamma=gamma) self.params.update(unit.params)
# Fastfood parameter set-up script.
#
# Declares two symbolic scalar dimensions (n, d), pads them (d up to a power
# of two, n up to a multiple of the padded d) and then draws the random
# parameter arrays B, G, PI and allocates the scaling array S.
#
# NOTE(review): the statement layout and loop nesting below were reconstructed
# from a whitespace-collapsed source -- confirm against the original file.
import theano.tensor as T
from theano.tensor import shared_randomstreams
import numpy as np
import numpy.random
from scipy.special import gammaincinv
from numpy.linalg import norm

# Symbolic (Theano) random stream, intended as the tensor counterpart of
# np.random.RandomState; it is not used by any statement visible in this script.
rngT = shared_randomstreams.RandomStreams()
# NumPy random generator, created without an explicit seed.
rng = numpy.random.RandomState()

# {{{ Fastfood Params }}}
# n and d are symbolic double-precision scalars, not concrete integers.
n, d = T.dscalars('n', 'd')

# Keep the caller-supplied dimensions before they are padded below.
d0, n0 = d, n
# Round d up to the next power of two: l = ceil(log2(d)), d = 2**l.
l = T.ceil(T.log2(d))  # TODO cast to int
d = 2**l
# k = number of d-sized blocks needed to cover n; n is then padded to d*k.
k = T.ceil(n/d)  # TODO cast to int
n = d*k

# Generate the parameter 'matrices'.
# NOTE(review): k and d are still symbolic Theano expressions at this point,
# yet they are passed as NumPy shapes and xrange bounds, which presumably need
# concrete Python ints (see the TODOs above) -- confirm how this is meant to run.
# B: entries drawn from {-1, +1}, requested shape (k, d).
B = rng.choice([-1, 1], size=(k, d))
# G: standard-normal entries, requested shape (k, d).
# NOTE(review): numpy.random.RandomState.normal is documented as
# normal(loc, scale, size) with no dtype keyword -- this call looks like it
# would raise TypeError; verify.
G = rng.normal(size=(k, d), dtype=np.float64)
# PI: k independent permutations of range(d), stacked and then transposed so
# that each column holds one permutation.
PI = np.array([rng.permutation(d) for _ in xrange(k)]).T
# S: uninitialised float64 column with k*d entries, to hold the scaling values.
S = np.empty((k*d, 1), dtype=np.float64)

# Generate the scaling matrix, S.
# NOTE(review): the visible loop body never writes into S and never uses i or
# j; the remainder of the computation appears to be missing from this excerpt.
for i in xrange(k):
    for j in xrange(d):
        # d uniform samples in [0, 1).
        p1 = rng.uniform(size=d)
        p2 = d/2
        # Inverse of the regularised lower incomplete gamma function,
        # called as gammaincinv(a=p2, y=p1).
        Tmp = gammaincinv(p2, p1)
        # The square root is taken with the Theano op, so Tmp becomes a
        # symbolic expression here rather than a NumPy array.
        Tmp = T.sqrt(2*Tmp)
def experiment(state, outdir_base='./'): rng.seed(1) # seed the numpy random generator # Initialize output directory and files data.mkdir_p(outdir_base) outdir = outdir_base + "/" + state.dataset + "/" data.mkdir_p(outdir) logfile = outdir + "log.txt" with open(logfile, 'w') as f: f.write("MODEL 2, {0!s}\n\n".format(state.dataset)) train_convergence_pre = outdir + "train_convergence_pre.csv" train_convergence_post = outdir + "train_convergence_post.csv" valid_convergence_pre = outdir + "valid_convergence_pre.csv" valid_convergence_post = outdir + "valid_convergence_post.csv" test_convergence_pre = outdir + "test_convergence_pre.csv" test_convergence_post = outdir + "test_convergence_post.csv" print print "----------MODEL 2, {0!s}--------------".format(state.dataset) print # load parameters from config file if this is a test config_filename = outdir + 'config' if state.test_model and 'config' in os.listdir(outdir): config_vals = load_from_config(config_filename) for CV in config_vals: print CV if CV.startswith('test'): print 'Do not override testing switch' continue try: exec('state.' + CV) in globals(), locals() except: exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] + "'") in globals(), locals() else: # Save the current configuration # Useful for logs/experiments print 'Saving config' with open(config_filename, 'w') as f: f.write(str(state)) print state # Load the data, train = train+valid, and sequence artificial = False if state.dataset == 'MNIST_1' or state.dataset == 'MNIST_2' or state.dataset == 'MNIST_3': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = data.load_mnist(state.data_path) train_X = numpy.concatenate((train_X, valid_X)) train_Y = numpy.concatenate((train_Y, valid_Y)) artificial = True try: dataset = int(state.dataset.split('_')[1]) except: raise AssertionError("artificial dataset number not recognized. 
Input was " + state.dataset) else: raise AssertionError("dataset not recognized.") train_X = theano.shared(train_X) train_Y = theano.shared(train_Y) valid_X = theano.shared(valid_X) valid_Y = theano.shared(valid_Y) test_X = theano.shared(test_X) test_Y = theano.shared(test_Y) if artificial: print 'Sequencing MNIST data...' print 'train set size:', len(train_Y.eval()) print 'valid set size:', len(valid_Y.eval()) print 'test set size:', len(test_Y.eval()) data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, dataset, rng) print 'train set size:', len(train_Y.eval()) print 'valid set size:', len(valid_Y.eval()) print 'test set size:', len(test_Y.eval()) print 'Sequencing done.' print N_input = train_X.eval().shape[1] root_N_input = numpy.sqrt(N_input) # Network and training specifications layers = state.layers # number hidden layers walkbacks = state.walkbacks # number of walkbacks layer_sizes = [N_input] + [state.hidden_size] * layers # layer sizes, from h0 to hK (h0 is the visible layer) learning_rate = theano.shared(cast32(state.learning_rate)) # learning rate annealing = cast32(state.annealing) # exponential annealing coefficient momentum = theano.shared(cast32(state.momentum)) # momentum term # PARAMETERS : weights list and bias list. # initialize a list of weights and biases based on layer_sizes weights_list = [get_shared_weights(layer_sizes[i], layer_sizes[i + 1], name="W_{0!s}_{1!s}".format(i, i + 1)) for i in range(layers)] # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out)) recurrent_weights_list = [ get_shared_weights(layer_sizes[i + 1], layer_sizes[i], name="V_{0!s}_{1!s}".format(i + 1, i)) for i in range(layers)] # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out)) bias_list = [get_shared_bias(layer_sizes[i], name='b_' + str(i)) for i in range(layers + 1)] # initialize each layer to 0's. 
# Theano variables and RNG MRG = RNG_MRG.MRG_RandomStreams(1) X = T.fmatrix('X') Xs = [T.fmatrix(name="X_initial") if i == 0 else T.fmatrix(name="X_" + str(i + 1)) for i in range(walkbacks + 1)] hiddens_input = [X] + [T.fmatrix(name="h_" + str(i + 1)) for i in range(layers)] hiddens_output = hiddens_input[:1] + hiddens_input[1:] # Check variables for bad inputs and stuff if state.batch_size > len(Xs): warnings.warn( "Batch size should not be bigger than walkbacks+1 (len(Xs)) unless you know what you're doing. You need to know the sequence length beforehand.") if state.batch_size <= 0: raise AssertionError("batch size cannot be <= 0") ''' F PROP ''' if state.hidden_act == 'sigmoid': print 'Using sigmoid activation for hiddens' hidden_activation = T.nnet.sigmoid elif state.hidden_act == 'rectifier': print 'Using rectifier activation for hiddens' hidden_activation = lambda x: T.maximum(cast32(0), x) elif state.hidden_act == 'tanh': print 'Using hyperbolic tangent activation for hiddens' hidden_activation = lambda x: T.tanh(x) else: raise AssertionError("Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid".format( state.hidden_act)) if state.visible_act == 'sigmoid': print 'Using sigmoid activation for visible layer' visible_activation = T.nnet.sigmoid elif state.visible_act == 'softmax': print 'Using softmax activation for visible layer' visible_activation = T.nnet.softmax else: raise AssertionError( "Did not recognize visible activation {0!s}, please use sigmoid or softmax".format(state.visible_act)) def update_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy=True, sampling=True): print 'odd layer updates' update_odd_layers(hiddens, noisy) print 'even layer updates' update_even_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy, sampling) # choose the correct output for hidden_outputs based on batch_size and walkbacks (this is due to an issue with batches, see note in run_story2.py) if state.batch_size <= len(Xs) and sequence_idx == 
state.batch_size - 1: return hiddens else: return None print 'done full update.' print # Odd layer update function # just a loop over the odd layers def update_odd_layers(hiddens, noisy): for i in range(1, len(hiddens), 2): print 'updating layer', i simple_update_layer(hiddens, None, None, None, i, add_noise=noisy) # Even layer update # p_X_chain is given to append the p(X|...) at each full update (one update = odd update + even update) def update_even_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy, sampling): for i in range(0, len(hiddens), 2): print 'updating layer', i simple_update_layer(hiddens, p_X_chain, Xs, sequence_idx, i, add_noise=noisy, input_sampling=sampling) # The layer update function # hiddens : list containing the symbolic theano variables [visible, hidden1, hidden2, ...] # layer_update will modify this list inplace # p_X_chain : list containing the successive p(X|...) at each update # update_layer will append to this list # add_noise : pre and post activation gaussian noise def simple_update_layer(hiddens, p_X_chain, Xs, sequence_idx, i, add_noise=True, input_sampling=True): # Compute the dot product, whatever layer # If the visible layer X if i == 0: print 'using', recurrent_weights_list[i] hiddens[i] = (T.dot(hiddens[i + 1], recurrent_weights_list[i]) + bias_list[i]) # If the top layer elif i == len(hiddens) - 1: print 'using', weights_list[i - 1] hiddens[i] = T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i] # Otherwise in-between layers else: # next layer : hiddens[i+1], assigned weights : W_i # previous layer : hiddens[i-1], assigned weights : W_(i-1) print "using {0!s} and {1!s}".format(weights_list[i - 1], recurrent_weights_list[i]) hiddens[i] = T.dot(hiddens[i + 1], recurrent_weights_list[i]) + T.dot(hiddens[i - 1], weights_list[i - 1]) + \ bias_list[i] # Add pre-activation noise if NOT input layer if i == 1 and state.noiseless_h1: print '>>NO noise in first hidden layer' add_noise = False # pre activation noise if i != 0 and 
add_noise: print 'Adding pre-activation gaussian noise for layer', i hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma) # ACTIVATION! if i == 0: print 'Sigmoid units activation for visible layer X' hiddens[i] = visible_activation(hiddens[i]) else: print 'Hidden units {} activation for layer'.format(state.act), i hiddens[i] = hidden_activation(hiddens[i]) # post activation noise # why is there post activation noise? Because there is already pre-activation noise, this just doubles the amount of noise between each activation of the hiddens. # if i != 0 and add_noise: # print 'Adding post-activation gaussian noise for layer', i # hiddens[i] = add_gaussian(hiddens[i], state.hidden_add_noise_sigma) # build the reconstruction chain if updating the visible layer X if i == 0: # if input layer -> append p(X|...) p_X_chain.append(hiddens[i]) # what the predicted next input should be if sequence_idx + 1 < len(Xs): next_input = Xs[sequence_idx + 1] # sample from p(X|...) - SAMPLING NEEDS TO BE CORRECT FOR INPUT TYPES I.E. FOR BINARY MNIST SAMPLING IS BINOMIAL. real-valued inputs should be gaussian if input_sampling: print 'Sampling from input' sampled = MRG.binomial(p=next_input, size=next_input.shape, dtype='float32') else: print '>>NO input sampling' sampled = next_input # add noise sampled = salt_and_pepper(sampled, state.input_salt_and_pepper) # DOES INPUT SAMPLING MAKE SENSE FOR SEQUENTIAL? - not really since it was used in walkbacks which was gibbs. 
# set input layer hiddens[i] = sampled def build_graph(hiddens, Xs, noisy=True, sampling=True): predicted_X_chain = [] # the visible layer that gets generated at each update_layers run H_chain = [] # either None or hiddens that gets generated at each update_layers run, this is used to determine what the correct hiddens_output should be print "Building the graph :", walkbacks, "updates" for i in range(walkbacks): print "Forward Prediction {!s}/{!s}".format(i + 1, walkbacks) H_chain.append(update_layers(hiddens, predicted_X_chain, Xs, i, noisy, sampling)) return predicted_X_chain, H_chain '''Build the main training graph''' # corrupt x hiddens_output[0] = salt_and_pepper(hiddens_output[0], state.input_salt_and_pepper) # build the computation graph and the generated visible layers and appropriate hidden_output predicted_X_chain, H_chain = build_graph(hiddens_output, Xs, noisy=True, sampling=state.input_sampling) # predicted_X_chain, H_chain = build_graph(hiddens_output, Xs, noisy=False, sampling=state.input_sampling) #testing one-hot without noise # choose the correct output for hiddens_output (this is due to the issue with batches - see note in run_story2.py) # this finds the not-None element of H_chain and uses that for hiddens_output h_empty = [True if h is None else False for h in H_chain] if False in h_empty: # if there was a not-None element hiddens_output = H_chain[h_empty.index(False)] # set hiddens_output to the appropriate element from H_chain ###################### # COST AND GRADIENTS # ###################### print if state.cost_funct == 'binary_crossentropy': print 'Using binary cross-entropy cost!' cost_function = lambda x, y: T.mean(T.nnet.binary_crossentropy(x, y)) elif state.cost_funct == 'square': print "Using square error cost!" cost_function = lambda x, y: T.mean(T.sqr(x - y)) else: raise AssertionError( "Did not recognize cost function {0!s}, please use binary_crossentropy or square".format(state.cost_funct)) print 'Cost w.r.t p(X|...) 
at every step in the graph' costs = [cost_function(predicted_X_chain[i], Xs[i + 1]) for i in range(len(predicted_X_chain))] # outputs for the functions show_COSTs = [costs[0]] + [costs[-1]] # cost for the gradient # care more about the immediate next predictions rather than the future - use exponential decay # COST = T.sum(costs) COST = T.sum([T.exp(-i / T.ceil(walkbacks / 3)) * costs[i] for i in range(len(costs))]) params = weights_list + recurrent_weights_list + bias_list print "params:", params print "creating functions..." gradient = T.grad(COST, params) gradient_buffer = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32')) for param in params] m_gradient = [momentum * gb + (cast32(1) - momentum) * g for (gb, g) in zip(gradient_buffer, gradient)] param_updates = [(param, param - learning_rate * mg) for (param, mg) in zip(params, m_gradient)] gradient_buffer_updates = zip(gradient_buffer, m_gradient) updates = OrderedDict(param_updates + gradient_buffer_updates) # odd layer h's not used from input -> calculated directly from even layers (starting with h_0) since the odd layers are updated first. f_cost = theano.function(inputs=hiddens_input + Xs, outputs=hiddens_output + show_COSTs, on_unused_input='warn') f_learn = theano.function(inputs=hiddens_input + Xs, updates=updates, outputs=hiddens_output + show_COSTs, on_unused_input='warn') print "functions done." print ############# # Denoise some numbers : show number, noisy number, reconstructed number ############# import random as R R.seed(1) # a function to add salt and pepper noise f_noise = theano.function(inputs=[X], outputs=salt_and_pepper(X, state.input_salt_and_pepper)) # Recompile the graph without noise for reconstruction function - the input x_recon is already going to be noisy, and this is to test on a simulated 'real' input. 
X_recon = T.fvector("X_recon") Xs_recon = [T.fvector("Xs_recon")] hiddens_R_input = [X_recon] + [T.fvector(name="h_recon_" + str(i + 1)) for i in range(layers)] hiddens_R_output = hiddens_R_input[:1] + hiddens_R_input[1:] # The layer update scheme print "Creating graph for noisy reconstruction function at checkpoints during training." p_X_chain_R, H_chain_R = build_graph(hiddens_R_output, Xs_recon, noisy=False) # choose the correct output from H_chain for hidden_outputs based on batch_size and walkbacks # choose the correct output for hiddens_output h_empty = [True if h is None else False for h in H_chain_R] if False in h_empty: # if there was a set of hiddens output from the batch_size-1 element of the chain hiddens_R_output = H_chain_R[ h_empty.index(False)] # extract out the not-None element from the list if it exists # if state.batch_size <= len(Xs_recon): # for i in range(len(hiddens_R_output)): # hiddens_R_output[i] = H_chain_R[state.batch_size - 1][i] f_recon = theano.function(inputs=hiddens_R_input + Xs_recon, outputs=hiddens_R_output + [p_X_chain_R[0], p_X_chain_R[-1]], on_unused_input="warn") ############ # Sampling # ############ # the input to the sampling function X_sample = T.fmatrix("X_sampling") network_state_input = [X_sample] + [T.fmatrix("H_sampling_" + str(i + 1)) for i in range(layers)] # "Output" state of the network (noisy) # initialized with input, then we apply updates network_state_output = [X_sample] + network_state_input[1:] visible_pX_chain = [] # ONE update print "Performing one walkback in network state sampling." _ = update_layers(network_state_output, visible_pX_chain, [X_sample], 0, noisy=True) if layers == 1: f_sample_simple = theano.function(inputs=[X_sample], outputs=visible_pX_chain[-1]) # WHY IS THERE A WARNING???? 
# because the first odd layers are not used -> directly computed FROM THE EVEN layers # unused input = warn f_sample2 = theano.function(inputs=network_state_input, outputs=network_state_output + visible_pX_chain, on_unused_input='warn') def sample_some_numbers_single_layer(): x0 = test_X.get_value()[:1] samples = [x0] x = f_noise(x0) for i in range(399): x = f_sample_simple(x) samples.append(x) x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32') x = f_noise(x) return numpy.vstack(samples) def sampling_wrapper(NSI): # * is the "splat" operator: It takes a list as input, and expands it into actual positional arguments in the function call. out = f_sample2(*NSI) NSO = out[:len(network_state_output)] vis_pX_chain = out[len(network_state_output):] return NSO, vis_pX_chain def sample_some_numbers(N=400): # The network's initial state init_vis = test_X.get_value()[:1] noisy_init_vis = f_noise(init_vis) network_state = [ [noisy_init_vis] + [numpy.zeros((1, len(b.get_value())), dtype='float32') for b in bias_list[1:]]] visible_chain = [init_vis] noisy_h0_chain = [noisy_init_vis] for i in range(N - 1): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def plot_samples(epoch_number, iteration): to_sample = time.time() if layers == 1: # one layer model V = sample_some_numbers_single_layer() else: V, H0 = sample_some_numbers() img_samples = PIL.Image.fromarray(tile_raster_images(V, (root_N_input, root_N_input), (20, 20))) fname = outdir + 'samples_iteration_' + str(iteration) + '_epoch_' + str(epoch_number) + '.png' img_samples.save(fname) print 'Took ' + str(time.time() - to_sample) + ' to sample 400 
numbers' ############## # Inpainting # ############## def inpainting(digit): # The network's initial state # NOISE INIT init_vis = cast32(numpy.random.uniform(size=digit.shape)) # noisy_init_vis = f_noise(init_vis) # noisy_init_vis = cast32(numpy.random.uniform(size=init_vis.shape)) # INDEXES FOR VISIBLE AND NOISY PART noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input / 2)) fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input / 2)) # function to re-init the visible to the same noise # FUNCTION TO RESET HALF VISIBLE TO DIGIT def reset_vis(V): V[0][fixed_idx] = digit[0][fixed_idx] return V # INIT DIGIT : NOISE and RESET HALF TO DIGIT init_vis = reset_vis(init_vis) network_state = [[init_vis] + [numpy.zeros((1, len(b.get_value())), dtype='float32') for b in bias_list[1:]]] visible_chain = [init_vis] noisy_h0_chain = [init_vis] for i in range(49): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # reset half the digit net_state_out[0] = reset_vis(net_state_out[0]) vis_pX_chain[0] = reset_vis(vis_pX_chain[0]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def save_params_to_file(name, n, params, iteration): print 'saving parameters...' 
save_path = outdir + name + '_params_iteration_' + str(iteration) + '_epoch_' + str(n) + '.pkl' f = open(save_path, 'wb') try: cPickle.dump(params, f, protocol=cPickle.HIGHEST_PROTOCOL) finally: f.close() ################ # GSN TRAINING # ################ def train_recurrent_GSN(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y): print '----------------------------------------' print 'TRAINING GSN FOR ITERATION', iteration with open(logfile, 'a') as f: f.write("--------------------------\nTRAINING GSN FOR ITERATION {0!s}\n".format(iteration)) # TRAINING n_epoch = state.n_epoch batch_size = state.batch_size STOP = False counter = 0 if iteration == 0: learning_rate.set_value(cast32(state.learning_rate)) # learning rate times = [] best_cost = float('inf') patience = 0 print 'learning rate:', learning_rate.get_value() print 'train X size:', str(train_X.shape.eval()) print 'valid X size:', str(valid_X.shape.eval()) print 'test X size:', str(test_X.shape.eval()) train_costs = [] valid_costs = [] test_costs = [] train_costs_post = [] valid_costs_post = [] test_costs_post = [] if state.vis_init: bias_list[0].set_value(logit(numpy.clip(0.9, 0.001, train_X.get_value().mean(axis=0)))) if state.test_model: # If testing, do not train and go directly to generating samples, parzen window estimation, and inpainting print 'Testing : skip training' STOP = True while not STOP: counter += 1 t = time.time() print counter, '\t', with open(logfile, 'a') as f: f.write("{0!s}\t".format(counter)) # shuffle the data data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, dataset, rng) # train # init hiddens # hiddens = [(T.zeros_like(train_X[:batch_size]).eval())] # for i in range(len(weights_list)): # # init with zeros # hiddens.append(T.zeros_like(T.dot(hiddens[i], weights_list[i])).eval()) hiddens = [T.zeros((batch_size, layer_size)).eval() for layer_size in layer_sizes] train_cost = [] train_cost_post = [] for i in 
range(len(train_X.get_value(borrow=True)) / batch_size): xs = [train_X.get_value(borrow=True)[ (i * batch_size) + sequence_idx: ((i + 1) * batch_size) + sequence_idx] for sequence_idx in range(len(Xs))] xs, hiddens = fix_input_size(xs, hiddens) hiddens[0] = xs[0] _ins = hiddens + xs _outs = f_learn(*_ins) hiddens = _outs[:len(hiddens)] cost = _outs[-2] cost_post = _outs[-1] train_cost.append(cost) train_cost_post.append(cost_post) train_cost = numpy.mean(train_cost) train_costs.append(train_cost) train_cost_post = numpy.mean(train_cost_post) train_costs_post.append(train_cost_post) print 'Train : ', trunc(train_cost), trunc(train_cost_post), '\t', with open(logfile, 'a') as f: f.write("Train : {0!s} {1!s}\t".format(trunc(train_cost), trunc(train_cost_post))) with open(train_convergence_pre, 'a') as f: f.write("{0!s},".format(train_cost)) with open(train_convergence_post, 'a') as f: f.write("{0!s},".format(train_cost_post)) # valid # init hiddens hiddens = [T.zeros((batch_size, layer_size)).eval() for layer_size in layer_sizes] valid_cost = [] valid_cost_post = [] for i in range(len(valid_X.get_value(borrow=True)) / batch_size): xs = [valid_X.get_value(borrow=True)[ (i * batch_size) + sequence_idx: ((i + 1) * batch_size) + sequence_idx] for sequence_idx in range(len(Xs))] xs, hiddens = fix_input_size(xs, hiddens) hiddens[0] = xs[0] _ins = hiddens + xs _outs = f_cost(*_ins) hiddens = _outs[:-2] cost = _outs[-2] cost_post = _outs[-1] valid_cost.append(cost) valid_cost_post.append(cost_post) valid_cost = numpy.mean(valid_cost) valid_costs.append(valid_cost) valid_cost_post = numpy.mean(valid_cost_post) valid_costs_post.append(valid_cost_post) print 'Valid : ', trunc(valid_cost), trunc(valid_cost_post), '\t', with open(logfile, 'a') as f: f.write("Valid : {0!s} {1!s}\t".format(trunc(valid_cost), trunc(valid_cost_post))) with open(valid_convergence_pre, 'a') as f: f.write("{0!s},".format(valid_cost)) with open(valid_convergence_post, 'a') as f: 
f.write("{0!s},".format(valid_cost_post)) # test # init hiddens hiddens = [T.zeros((batch_size, layer_size)).eval() for layer_size in layer_sizes] test_cost = [] test_cost_post = [] for i in range(len(test_X.get_value(borrow=True)) / batch_size): xs = [test_X.get_value(borrow=True)[ (i * batch_size) + sequence_idx: ((i + 1) * batch_size) + sequence_idx] for sequence_idx in range(len(Xs))] xs, hiddens = fix_input_size(xs, hiddens) hiddens[0] = xs[0] _ins = hiddens + xs _outs = f_cost(*_ins) hiddens = _outs[:-2] cost = _outs[-2] cost_post = _outs[-1] test_cost.append(cost) test_cost_post.append(cost_post) test_cost = numpy.mean(test_cost) test_costs.append(test_cost) test_cost_post = numpy.mean(test_cost_post) test_costs_post.append(test_cost_post) print 'Test : ', trunc(test_cost), trunc(test_cost_post), '\t', with open(logfile, 'a') as f: f.write("Test : {0!s} {1!s}\t".format(trunc(test_cost), trunc(test_cost_post))) with open(test_convergence_pre, 'a') as f: f.write("{0!s},".format(test_cost)) with open(test_convergence_post, 'a') as f: f.write("{0!s},".format(test_cost_post)) # check for early stopping cost = train_cost if cost < best_cost * state.early_stop_threshold: patience = 0 best_cost = cost else: patience += 1 if counter >= n_epoch or patience >= state.early_stop_length: STOP = True save_params_to_file('gsn', counter, params, iteration) timing = time.time() - t times.append(timing) print 'time : ', trunc(timing), print 'remaining: ', trunc((n_epoch - counter) * numpy.mean(times) / 60 / 60), 'hrs', print 'B : ', [trunc(abs(b.get_value(borrow=True)).mean()) for b in bias_list], print 'W : ', [trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list], print 'V : ', [trunc(abs(v.get_value(borrow=True)).mean()) for v in recurrent_weights_list] with open(logfile, 'a') as f: f.write("MeanVisB : {0!s}\t".format(trunc(bias_list[0].get_value().mean()))) with open(logfile, 'a') as f: f.write("W : 
{0!s}\t".format(str([trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list]))) with open(logfile, 'a') as f: f.write("Time : {0!s} seconds\n".format(trunc(timing))) if (counter % state.save_frequency) == 0: # Checking reconstruction nums = test_X.get_value()[range(100)] noisy_nums = f_noise(test_X.get_value()[range(100)]) reconstructed_prediction = [] reconstructed_prediction_end = [] # init reconstruction hiddens hiddens = [T.zeros(layer_size).eval() for layer_size in layer_sizes] for num in noisy_nums: hiddens[0] = num for i in range(len(hiddens)): if len(hiddens[i].shape) == 2 and hiddens[i].shape[0] == 1: hiddens[i] = hiddens[i][0] _ins = hiddens + [num] _outs = f_recon(*_ins) hiddens = _outs[:len(hiddens)] [reconstructed_1, reconstructed_n] = _outs[len(hiddens):] reconstructed_prediction.append(reconstructed_1) reconstructed_prediction_end.append(reconstructed_n) with open(logfile, 'a') as f: f.write("\n") for i in range(len(nums)): if len(reconstructed_prediction[i].shape) == 2 and reconstructed_prediction[i].shape[0] == 1: reconstructed_prediction[i] = reconstructed_prediction[i][0] print nums[i].tolist(), "->", reconstructed_prediction[i].tolist() with open(logfile, 'a') as f: f.write("{0!s} -> {1!s}\n".format(nums[i].tolist(), [trunc(n) if n > 0.0001 else trunc(0.00000000000000000) for n in reconstructed_prediction[i].tolist()])) with open(logfile, 'a') as f: f.write("\n") # # Concatenate stuff # stacked = numpy.vstack([numpy.vstack([nums[i*10 : (i+1)*10], noisy_nums[i*10 : (i+1)*10], reconstructed_prediction[i*10 : (i+1)*10], reconstructed_prediction_end[i*10 : (i+1)*10]]) for i in range(10)]) # numbers_reconstruction = PIL.Image.fromarray(tile_raster_images(stacked, (root_N_input,root_N_input), (10,40))) # numbers_reconstruction.save(outdir+'gsn_number_reconstruction_iteration_'+str(iteration)+'_epoch_'+str(counter)+'.png') # # #sample_numbers(counter, 'seven') # plot_samples(counter, iteration) # # #save params # save_params_to_file('gsn', 
counter, params, iteration) # ANNEAL! new_lr = learning_rate.get_value() * annealing learning_rate.set_value(new_lr) # 10k samples print 'Generating 10,000 samples' samples, _ = sample_some_numbers(N=10000) f_samples = outdir + 'samples.npy' numpy.save(f_samples, samples) print 'saved digits' ##################### # STORY 2 ALGORITHM # ##################### for iter in range(state.max_iterations): train_recurrent_GSN(iter, train_X, train_Y, valid_X, valid_Y, test_X, test_Y)