def test_shape():
    x = T.tensor3()
    x_flat_2_mat = T.flatten(x, 2)
    x_flat_2_vec = T.flatten(x, 1)
    flat_f = theano.function([x], [x_flat_2_mat, x_flat_2_vec])
    flat_mat_val, flat_vec_val = flat_f(tensor3_val)
    print 'flatten to 2-d array:'
    print flat_mat_val
    print 'flatten to 1-d array:'
    print flat_vec_val

    x_mat = T.matrix()
    x_mat_2_t3 = T.reshape(x_mat, (2, 2, 2))
    x_mat_2_vec = T.reshape(x_mat, (8,))
    reshape_f = theano.function([x_mat], [x_mat_2_t3, x_mat_2_vec])
    """
    t3_shape = T.lvector()
    vec_shape = T.lvector()
    x_mat_2_t3 = T.reshape(x_mat, t3_shape, 3)
    x_mat_2_vec = T.reshape(x_mat, vec_shape, 1)
    reshape_f = theano.function([x_mat, t3_shape, vec_shape],
                                [x_mat_2_t3, x_mat_2_vec])
    """
    mat_2_t3_val, mat_2_vec_val = reshape_f(flat_mat_val)
    print 'reshape 2-d array to 3-d array:'
    print mat_2_t3_val
    print 'reshape 2-d array to 1-d array:'
    print mat_2_vec_val
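The demo above assumes `theano`, `T` (theano.tensor), and a small `tensor3_val` array are already in scope; a minimal setup sketch could look like this (the concrete (2, 2, 2) shape is an assumption, chosen so both reshape calls succeed):

# Assumed context for the demo above, not part of the original snippet.
import numpy as np
import theano
import theano.tensor as T

tensor3_val = np.arange(8, dtype=theano.config.floatX).reshape(2, 2, 2)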
def loop(i, x, p, t):
    p_class_t = p[i, t[i]]
    return T.dot(T.flatten(T.grad(p_class_t, x)[i]),
                 T.flatten(x[i]))
def build_model(tparams, options, Wemb):
    trng = RandomStreams(123)
    use_noise = theano.shared(numpy_floatX(0.))

    x = T.matrix('x', dtype='int32')
    t = T.matrix('t', dtype=config.floatX)
    mask = T.matrix('mask', dtype=config.floatX)
    y = T.vector('y', dtype='int32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    x_emb = Wemb[x.flatten()].reshape([n_timesteps, n_samples, options['embDimSize']])
    x_t_emb = T.concatenate([t.reshape([n_timesteps, n_samples, 1]), x_emb], axis=2)  # Adding the time element to the embedding

    proj = gru_layer(tparams, x_t_emb, options, mask=mask)
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    p_y_given_x = T.nnet.sigmoid(T.dot(proj, tparams['W_logistic']) + tparams['b_logistic'])
    L = -(y * T.flatten(T.log(p_y_given_x)) + (1 - y) * T.flatten(T.log(1 - p_y_given_x)))
    cost = T.mean(L)

    if options['L2_reg'] > 0.:
        cost += options['L2_reg'] * (tparams['W_logistic'] ** 2).sum()

    return use_noise, x, t, mask, y, p_y_given_x, cost
def unet_crossentropy_loss_sampled(y_true, y_pred):
    print 'unet_crossentropy_loss_sampled'
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0 - epsilon))
    y_true = T.flatten(y_true)
    # this seems to work
    # it is super ugly though and I am sure there is a better way to do it
    # but I am struggling with theano to cooperate
    # filter the right indices
    indPos = T.nonzero(y_true)[0]  # no idea why this is a tuple
    indNeg = T.nonzero(1 - y_true)[0]
    # shuffle
    n = indPos.shape[0]
    indPos = indPos[srng.permutation(n=n)]
    n = indNeg.shape[0]
    indNeg = indNeg[srng.permutation(n=n)]
    # take equal number of samples depending on which class has less
    n_samples = T.cast(T.min([T.sum(y_true), T.sum(1 - y_true)]), dtype='int64')
    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]
    loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1 - y_pred_clipped[indNeg]))
    average_loss = T.mean(loss_vector)
    print 'average_loss:', average_loss
    return average_loss
def apply(self, dataset, can_fit=True):
    x = dataset.get_design_matrix()

    denseX = T.matrix(dtype=x.dtype)

    image_shape = (len(x),) + self.img_shape
    X = denseX.reshape(image_shape)
    filters = gaussian_filter_9x9().reshape((1, 1, 9, 9))

    convout = conv.conv2d(input=X,
                          filters=filters,
                          image_shape=image_shape,
                          filter_shape=(1, 1, 9, 9),
                          border_mode='full')

    # For each pixel, remove mean of 9x9 neighborhood
    centered_X = X - convout[:, :, 4:-4, 4:-4]

    # Scale down norm of 9x9 patch if norm is bigger than 1
    sum_sqr_XX = conv.conv2d(input=centered_X ** 2,
                             filters=filters,
                             image_shape=image_shape,
                             filter_shape=(1, 1, 9, 9),
                             border_mode='full')

    denom = T.sqrt(sum_sqr_XX[:, :, 4:-4, 4:-4])
    per_img_mean = T.mean(T.flatten(denom, outdim=3), axis=2)
    divisor = T.largest(per_img_mean.dimshuffle((0, 1, 'x', 'x')), denom)

    new_X = centered_X / divisor
    new_X = T.flatten(new_X, outdim=2)

    f = theano.function([denseX], new_X)
    dataset.set_design_matrix(f(x))
def unet_crossentropy_loss_sampled(y_true, y_pred):
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0 - epsilon))
    y_true = T.flatten(y_true)
    # this seems to work
    # it is super ugly though and I am sure there is a better way to do it
    # but I am struggling with theano to cooperate
    # filter the right indices
    classPos = 1
    classNeg = 0
    indPos = T.eq(y_true, classPos).nonzero()[0]
    indNeg = T.eq(y_true, classNeg).nonzero()[0]
    #pos = y_true[ indPos ]
    #neg = y_true[ indNeg ]
    # shuffle
    n = indPos.shape[0]
    indPos = indPos[UNET.srng.permutation(n=n)]
    n = indNeg.shape[0]
    indNeg = indNeg[UNET.srng.permutation(n=n)]
    # take equal number of samples depending on which class has less
    n_samples = T.cast(T.min([indPos.shape[0], indNeg.shape[0]]), dtype='int64')
    #n_samples = T.cast(T.min([T.sum(y_true), T.sum(1-y_true)]), dtype='int64')
    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]
    #loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1-y_pred_clipped[indNeg]))
    loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(y_pred_clipped[indNeg]))
    loss_vector = T.clip(loss_vector, epsilon, 1.0 - epsilon)
    average_loss = T.mean(loss_vector)
    # average_loss is symbolic, so a Python `if T.isnan(...)` cannot branch on it;
    # use a symbolic switch to fall back when the loss is NaN instead.
    average_loss = T.switch(T.isnan(average_loss),
                            T.mean(y_pred_clipped[indPos]),
                            average_loss)
    return average_loss
def _recurrence(v_h_, x_h_, v_t_, x_t_, a_t_, is_aggressive):
    state = tt.concatenate([v_h_, x_h_, tt.flatten(v_t_), tt.flatten(x_t_), tt.flatten(a_t_)])

    h0 = tt.dot(state, self.W_a_0) + self.b_a_0
    relu0 = tt.nnet.relu(h0)

    h1 = tt.dot(relu0, self.W_a_1) + self.b_a_1
    relu1 = tt.nnet.relu(h1)

    h2 = tt.dot(relu1, self.W_a_2) + self.b_a_2
    relu2 = tt.nnet.relu(h2)

    a = tt.dot(relu2, self.W_a_c)

    v_h, x_h, v_t, x_t, a_t, cost_transition = _step_state(v_h_, x_h_, v_t_, x_t_, a_t_, a, is_aggressive)

    # cost:
    # 0. smooth acceleration policy
    cost_accel = tt.abs_(a)

    # 1. forcing the host to move forward (until the top point of the roundabout)
    cost_progress = tt.nnet.relu(0.5 * self.two_pi_r - x_h)

    # 2. keeping distance from close vehicles
    x_abs_diffs = tt.abs_(x_h - x_t)
    cost_accident = tt.mean(3 * tt.nnet.relu(self.require_distance - x_abs_diffs)) * (x_h > -0.5 * self.host_length)  # tt.nnet.sigmoid(x_h + 0.5*self.host_length)

    cost = self.alpha_accel * cost_accel + self.alpha_progress * cost_progress + self.alpha_accident * cost_accident

    return (v_h, x_h, v_t, x_t, a_t, cost, cost_transition), t.scan_module.until(x_h[0] >= 0.45 * self.two_pi_r)
def __call__(self, x, leak):
    f1 = 0.5 * (1 + leak)
    f2 = 0.5 * (1 - leak)
    if leak.ndim == 1:
        return T.flatten(f1, 1)[0] * x + T.flatten(f2, 1)[0] * abs(x)
    else:
        return f1 * x + f2 * abs(x)
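A quick sketch of what the callable above computes for a scalar leak; the standalone expression below just mirrors the `else` branch for illustration:

# Sketch: 0.5*(1+leak)*x + 0.5*(1-leak)*|x| is a parametrized leaky rectifier,
# i.e. it returns x for x >= 0 and leak*x for x < 0.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
leak = T.scalar('leak')
y = 0.5 * (1 + leak) * x + 0.5 * (1 - leak) * abs(x)
f = theano.function([x, leak], y)
print f(np.array([[-1., 2.]], dtype=theano.config.floatX), 0.1)  # ~ [[-0.1  2. ]]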
def __create_node_set(self, n_features, n_output, data_in, note_set_name, weightsFunc = None,): prev_out = data_in prev_dim = n_features layers = [] n_weights = 0 weights_list = [] state = None for i_h_layer in range(0,len(self.hidden_dimensions)): n_hidden_nodes = self.hidden_dimensions[i_h_layer] weights = None #weights = np.ones((prev_dim,n_hidden_nodes)) - 0.5 bias = None if weightsFunc is not None: weights,bias,state = weightsFunc(i_h_layer,state) #acutal hidden layer hidden_layer = Layer(data_in=prev_out, n_input=prev_dim, n_output=n_hidden_nodes, link_function=self.link_function_hidden, weights=weights, bias=bias, name=note_set_name + " Hidden Layer") weights_list.append(hidden_layer.weights) weights_list.append(hidden_layer.bias) layers.append(hidden_layer) n_weights += (prev_dim+1)*n_hidden_nodes prev_out = hidden_layer.output prev_dim = n_hidden_nodes weights = None #weights = np.ones((prev_dim,n_output)) - 0.5 bias = None if weightsFunc is not None: weights,bias,state = weightsFunc(len(self.hidden_dimensions),state) output_layer = Layer( data_in=prev_out, n_input=prev_dim, n_output=n_output, link_function=self.link_function_output, weights=weights, bias=bias, name=note_set_name + " Output Layer") weights_list.append(output_layer.weights) weights_list.append(output_layer.bias) layers.append(output_layer) n_weights += (prev_dim+1)*n_output #concatenate weights into one huge vector flat_weights = T.concatenate([T.flatten(item) for item in weights_list]) flat_weights.name = "Network " + note_set_name + " Weights" #compute MSE y = self.__y errors = y - output_layer.output mse = T.mean(T.sqr(errors)) normalized_mse = mse / 2.0 normalized_mse.name = note_set_name + " MSE" grads = T.concatenate([T.flatten(item) for item in T.grad(normalized_mse, weights_list)]) grads.name = note_set_name + " Gradients" return layers,grads,normalized_mse,weights_list, n_weights, flat_weights
def model(X1, X2, w1, w2, w3, p_drop_conv):
    # first half of the first layer
    l1a = T.flatten(dropout(T.mean(rectify(conv2d(X1, w1, border_mode='valid')), axis=3), p_drop_conv), outdim=2)
    # second half of the first layer
    l1b = T.flatten(dropout(T.mean(rectify(conv2d(X2, w2, border_mode='valid')), axis=3), p_drop_conv), outdim=2)
    # combine the two parts as the first layer
    l1 = T.concatenate([l1a, l1b], axis=1)
    pyx = T.dot(l1, w3)
    return pyx
def lower_bound(self):
    mu = T.flatten(self.trunc_output, outdim=2)
    inp = T.flatten(self.inpt, outdim=2)
    if self.out_distribution == True:
        sigma = T.mean(T.flatten(self.trunk_sigma, outdim=2))
    else:
        sigma = 0
    # log_gauss = 0.5*np.log(2 * np.pi) + 0.5*sigma + 0.5 * ((inp - mu) / T.exp(sigma))**2.
    log_gauss = T.sum(0.5 * np.log(2 * np.pi) + 0.5 * sigma +
                      0.5 * ((inp - mu) / T.exp(sigma)) ** 2.0, axis=1)
    return T.mean(log_gauss - self.latent_layer.prior)
def t_unroll_ae(wts, bs, tied_wts=False):
    ''' Flattens matrices and concatenates to a vector - specifically for autoencoders '''

    # if we have tied weights, this vector will be comprised of a single matrix and two
    # distinct bias vectors
    if tied_wts:
        v = np.array([], dtype=theano.config.floatX)
        v = T.concatenate(
            (v, T.flatten(wts[0]), T.flatten(bs[0]), T.flatten(bs[1])))
        return v
    return t_unroll(wts, bs)
def model(X, h2_u, h3_u, h2_s, h3_s, w, w2, g2, b2, w3, g3, b3, wy):
    h = lrelu(dnn_conv(X, w, subsample=(2, 2), border_mode=(2, 2)))
    h2 = lrelu(batchnorm(dnn_conv(h, w2, subsample=(2, 2), border_mode=(2, 2)), g=g2, b=b2, u=h2_u, s=h2_s))
    h3 = lrelu(batchnorm(dnn_conv(h2, w3, subsample=(2, 2), border_mode=(2, 2)), g=g3, b=b3, u=h3_u, s=h3_s))
    h = T.flatten(dnn_pool(h, (4, 4), (4, 4), mode='max'), 2)
    h2 = T.flatten(dnn_pool(h2, (2, 2), (2, 2), mode='max'), 2)
    h3 = T.flatten(dnn_pool(h3, (1, 1), (1, 1), mode='max'), 2)
    f = T.concatenate([h, h2, h3], axis=1)
    return [f]
def model(X, w1, w2, w3, p_drop_conv, p_drop_hidden):
    l1a = rectify(conv2d(X, w1, border_mode='full'))
    l1 = max_pool_2d(l1a, (2, 2))
    l1 = dropout(l1, p_drop_conv)

    # dead expression in the original; its result was never assigned or used:
    # dropout(T.flatten(max_pool_2d(rectify(conv2d(X, w2)), (2, 2)), outdim=2), 0.3)

    l2a = rectify(conv2d(l1, w2))
    l2b = max_pool_2d(l2a, (2, 2))
    l2 = T.flatten(l2b, outdim=2)
    l2 = dropout(l2, p_drop_conv)

    pyx = softmax(T.dot(l2, w3))
    return l1, l2, pyx
def set_sampling_function(decoder_feature_function, decoder_red_function, decoder_green_function, decoder_blue_function): hidden_data = T.matrix(name='hidden_data', dtype=theano.config.floatX) # decoder decoder_outputs = decoder_feature_function(hidden_data) decoder_feature = decoder_outputs[1] decoder_red = decoder_red_function(decoder_feature) decoder_green = decoder_green_function(decoder_feature) decoder_blue = decoder_blue_function(decoder_feature) num_samples = decoder_red.shape[0] num_rows = decoder_red.shape[2] num_cols = decoder_red.shape[3] num_pixels = num_rows*num_cols # shape = (num_samples, num_intensity, num_pixels) decoder_red = T.flatten(decoder_red, 3) decoder_green = T.flatten(decoder_green, 3) decoder_blue = T.flatten(decoder_blue, 3) # shape = (num_samples, num_pixels, num_intensity) decoder_red = T.swapaxes(decoder_red, axis1=1, axis2=2) decoder_green = T.swapaxes(decoder_green, axis1=1, axis2=2) decoder_blue = T.swapaxes(decoder_blue, axis1=1, axis2=2) # shape = (num_samples*num_pixels, num_intensity) decoder_red = decoder_red.reshape((num_samples*num_pixels, -1)) decoder_green = decoder_green.reshape((num_samples*num_pixels, -1)) decoder_blue = decoder_blue.reshape((num_samples*num_pixels, -1)) # softmax decoder_red = T.argmax(T.nnet.softmax(decoder_red),axis=1) decoder_green = T.argmax(T.nnet.softmax(decoder_green),axis=1) decoder_blue = T.argmax(T.nnet.softmax(decoder_blue),axis=1) decoder_red = decoder_red.reshape((num_samples, 1, num_rows, num_cols)) decoder_green = decoder_green.reshape((num_samples, 1, num_rows, num_cols)) decoder_blue = decoder_blue.reshape((num_samples, 1, num_rows, num_cols)) decoder_image = T.concatenate([decoder_red, decoder_green, decoder_blue], axis=1) function_inputs = [hidden_data,] function_outputs = [decoder_image,] function = theano.function(inputs=function_inputs, outputs=function_outputs, on_unused_input='ignore') return function
def gauss_style_loss(x_truth, x_guess, log_var=0., scale=1., use_huber=False):
    # compute gram matrices for the two batches of convolutional features
    g_t = T.flatten(gram_matrix(x_truth), 2)
    g_g = T.flatten(gram_matrix(x_guess), 2)
    # get normalization factors based on the size of feature maps
    # N = T.cast(x_truth.shape[1], 'floatX')
    # M = T.cast(x_truth.shape[2] * x_truth.shape[3], 'floatX')
    # compute a pseudo-Gaussian loss on difference between gram matrices
    loss = log_prob_gaussian(g_t, g_g, log_vars=log_var, do_sum=False,
                             use_huber=use_huber, mask=None)
    # take sum over gram matrix entries and normalize for feature map size
    # loss = (scale / (N**2. * M)) * T.sum(loss, axis=1, keepdims=False)
    loss = T.sum(loss, axis=1, keepdims=False)
    return loss
def L2SVMcost(self, y): """Return the mean of the negative log-likelihood of the prediction of this model under a given target distribution. .. math:: \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ \ell (\theta=\{W,b\}, \mathcal{D}) :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the correct label Note: we use the mean instead of the sum so that the learning rate is less dependent on the batch size """ '''p = -T.ones_like((y.shape[0],7)) result, updates = theano.scan(fn = lambda p,y: T.basic.set_subtensor(p[i,y[i]]=1), outputs_info = -T.ones_like((y.shape[0],7)), non_sequences = y, n_steps = y.shape[0]) final_result = result[-1] f = theano.function([y,p],final_result,updates = updates) for i in xrange(500): p = T.basic.set_subtensor(p[i,y[i]]=1) print p.shape print f(y,p) print f(y,p).shape''' # y.shape[0] is (symbolically) the number of rows in y, i.e., # number of examples (call it n) in the minibatch # T.arange(y.shape[0]) is a symbolic vector which will contain # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of # Log-Probabilities (call it LP) with one row per example and # one column per class LP[T.arange(y.shape[0]),y] is a vector # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is # the mean (across minibatch examples) of the elements in v, # i.e., the mean log-likelihood across the minibatch. z = 0.5*T.dot( T.flatten(self.W,outdim=1), T.flatten(self.W, outdim=1)) + 0.5*T.dot( T.flatten(self.b,outdim=1), T.flatten(self.b, outdim=1)) +0.6* T.sum(T.maximum(0,(1-self.p_y_given_x *y)),axis=1).mean() #zk = theano.tensor.scalar('zk') #zp = theano.printing.Print('this is a very important value')(zk) #f = theano.function([zk],zp) #z = theano.shared(z) #f(z) return z
def create_model():
    """Create the deep autoencoder model with Blocks, and load MNIST."""
    mlp = MLP(activations=[Logistic(), Logistic(), Logistic(), None,
                           Logistic(), Logistic(), Logistic(), Logistic()],
              dims=[784, 1000, 500, 250, 30, 250, 500, 1000, 784],
              weights_init=Sparse(15, IsotropicGaussian()),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    x_hat = mlp.apply(tensor.flatten(x, outdim=2))
    squared_err = SquaredError().apply(tensor.flatten(x, outdim=2), x_hat)
    cost = BinaryCrossEntropy().apply(tensor.flatten(x, outdim=2), x_hat)

    return x, cost, squared_err
def set_updater_function(feature_extractor, sample_generator, generator_parameters, generator_optimizer):
    # set input data, hidden data
    input_data = T.tensor4(name='input_data', dtype=theano.config.floatX)
    hidden_data = T.matrix(name='hidden_data', dtype=theano.config.floatX)

    # extract feature from input data
    positive_features = feature_extractor(input_data)

    # sample data
    negative_features = sample_generator(hidden_data)
    negative_data = negative_features[-1]
    negative_features = negative_features[:-1]

    # moment matching
    moment_match_cost = 0
    for i in xrange(len(positive_features)):
        pos_feat = positive_features[i]
        neg_feat = negative_features[i]
        moment_match_cost += T.mean(T.sqr(T.mean(pos_feat, axis=0) - T.mean(neg_feat, axis=0)))
        moment_match_cost += T.mean(T.sqr(T.mean(T.sqr(pos_feat), axis=0) - T.mean(T.sqr(neg_feat), axis=0)))

    pos_feat = T.flatten(input_data, 2)
    neg_feat = T.flatten(negative_data, 2)
    moment_match_cost += T.mean(T.sqr(T.mean(pos_feat, axis=0) - T.mean(neg_feat, axis=0)))
    moment_match_cost += T.mean(T.sqr(T.mean(T.sqr(pos_feat), axis=0) - T.mean(T.sqr(neg_feat), axis=0)))

    generator_updates = generator_optimizer(generator_parameters, moment_match_cost)

    # updater function input
    updater_function_inputs = [input_data, hidden_data]
    # updater function output
    updater_function_outputs = [moment_match_cost, negative_data]

    # updater function
    updater_function = theano.function(inputs=updater_function_inputs,
                                       outputs=updater_function_outputs,
                                       updates=generator_updates,
                                       on_unused_input='ignore')
    return updater_function
def jacobian_mul_vector_l_flat(y, x, W, v, x_val, W_val, v_val):
    J = theano.gradient.jacobian(y, x)
    J_flat = T.flatten(J, J.ndim - 1)  # The jacobian result on flattened matrix x
    VJ = v.dot(J_flat)
    VJ_reshape = T.reshape(VJ, T.shape(x))
    f_VJ = theano.function([x, W, v], VJ_reshape)
    return f_VJ(x_val, W_val, v_val)
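A usage sketch for the Jacobian-vector helper above, built on a simple y = dot(x, W) graph; all names, shapes, and values below are illustrative assumptions:

# Sketch: vector-Jacobian product of a flattened dot-product graph.
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
x = T.matrix('x')                  # will be fed a (2, 3) value
W = T.matrix('W')                  # will be fed a (3, 4) value
y = T.flatten(T.dot(x, W))         # 1-d output, as theano.gradient.jacobian expects
v = T.vector('v')                  # length 8, matching y's size

x_val = np.ones((2, 3), dtype=floatX)
W_val = np.ones((3, 4), dtype=floatX)
v_val = np.ones(8, dtype=floatX)
print jacobian_mul_vector_l_flat(y, x, W, v, x_val, W_val, v_val)  # a (2, 3) array of 4s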
def apply(self, dataset, can_fit=True):
    x = dataset.get_design_matrix()

    denseX = T.matrix(dtype=x.dtype)

    image_shape = (len(x),) + self.img_shape
    X = denseX.reshape(image_shape)

    ones_patch = T.ones((1, 1, 9, 9), dtype=x.dtype)

    convout = conv.conv2d(input=X,
                          filters=ones_patch / (9. * 9.),
                          image_shape=image_shape,
                          filter_shape=(1, 1, 9, 9),
                          border_mode='full')

    # For each pixel, remove mean of 9x9 neighborhood
    centered_X = X - convout[:, :, 4:-4, 4:-4]

    # Scale down norm of 9x9 patch if norm is bigger than 1
    sum_sqr_XX = conv.conv2d(input=centered_X ** 2,
                             filters=ones_patch,
                             image_shape=image_shape,
                             filter_shape=(1, 1, 9, 9),
                             border_mode='full')

    denom = T.sqrt(sum_sqr_XX[:, :, 4:-4, 4:-4])
    xdenom = denom.reshape(X.shape)
    new_X = centered_X / T.largest(1.0, xdenom)
    new_X = T.flatten(new_X, outdim=2)

    f = theano.function([denseX], new_X)
    dataset.set_design_matrix(f(x))
def model(X, w1, w2, w3, Max_Pooling_Shape, p_drop_conv, p_drop_hidden):
    l1 = T.flatten(dropout(max_pool_2d(rectify(conv2d(X, w1, border_mode="valid")),
                                       Max_Pooling_Shape), p_drop_conv),
                   outdim=2)
    l2 = dropout(rectify(T.dot(l1, w2)), p_drop_hidden)
    pyx = softmax(T.dot(l2, w3))
    return pyx
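Several of these model functions rely on T.flatten(..., outdim=2) to turn pooled convolutional feature maps into the (batch, features) matrix expected by the dense layer; a small shape-check sketch (the concrete shapes are illustrative):

# Sketch: T.flatten with outdim=2 keeps the batch axis and collapses the rest.
import numpy as np
import theano
import theano.tensor as T

conv_out = T.tensor4('conv_out')      # (batch, channels, rows, cols)
flat = T.flatten(conv_out, outdim=2)  # (batch, channels*rows*cols)
f = theano.function([conv_out], flat.shape)
print f(np.zeros((5, 8, 6, 6), dtype=theano.config.floatX))  # prints [  5 288]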
def model_conv(X, w_1, w_2, w_3, w_h2, w_o, p_use_input, p_use_hidden):
    X = dropout(X, p_use_input)

    # first convolutional layer:
    conv_layer_1 = rectify(T.nnet.conv2d(X, w_1, border_mode='full'))
    sub_layer_1 = T.signal.downsample.max_pool_2d(conv_layer_1, (2, 2))
    out_1 = dropout(sub_layer_1, p_use_input)

    # second convolutional layer:
    conv_layer_2 = rectify(T.nnet.conv2d(out_1, w_2))
    sub_layer_2 = T.signal.downsample.max_pool_2d(conv_layer_2, (2, 2))
    out_2 = dropout(sub_layer_2, p_use_hidden)

    # third convolutional layer:
    conv_layer_3 = rectify(T.nnet.conv2d(out_2, w_3))
    sub_layer_3 = T.signal.downsample.max_pool_2d(conv_layer_3, (2, 2))
    out_3 = dropout(sub_layer_3, p_use_hidden)
    out_3 = T.flatten(out_3, outdim=2)

    h2 = rectify(T.dot(out_3, w_h2))
    h2 = dropout(h2, p_use_hidden)

    # output layer, activation function = softmax
    py_x = softmax(T.dot(h2, w_o))
    return out_1, out_2, out_3, h2, py_x
def test_log1msigm_to_softplus(self):
    x = T.matrix()

    out = T.log(1 - sigmoid(x))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    assert isinstance(topo[0].op.scalar_op,
                      theano.tensor.nnet.sigm.ScalarSoftplus)
    assert isinstance(topo[1].op.scalar_op, theano.scalar.Neg)
    f(numpy.random.rand(54, 11).astype(config.floatX))

    # Same test with a flatten
    out = T.log(1 - T.flatten(sigmoid(x)))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, T.Flatten)
    assert isinstance(topo[1].op.scalar_op,
                      theano.tensor.nnet.sigm.ScalarSoftplus)
    assert isinstance(topo[2].op.scalar_op, theano.scalar.Neg)
    f(numpy.random.rand(54, 11).astype(config.floatX))

    # Same test with a reshape
    out = T.log(1 - sigmoid(x).reshape([x.size]))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    # assert len(topo) == 3
    assert any(isinstance(node.op, T.Reshape) for node in topo)
    assert any(isinstance(getattr(node.op, 'scalar_op', None),
                          theano.tensor.nnet.sigm.ScalarSoftplus)
               for node in topo)
    f(numpy.random.rand(54, 11).astype(config.floatX))
def _flatten_1d_or_2d(v):
    if v.ndim > 2:
        return T.flatten(v, outdim=2)
    elif 1 <= v.ndim <= 2:
        return v
    else:
        raise ValueError
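A quick sketch of how `_flatten_1d_or_2d` behaves on inputs of different rank (the variable types below are illustrative):

# Sketch: 1-d/2-d inputs pass through unchanged, higher ranks are flattened to 2-d.
import theano.tensor as T

vec = T.vector()   # ndim == 1 -> returned unchanged
mat = T.matrix()   # ndim == 2 -> returned unchanged
t4 = T.tensor4()   # ndim == 4 -> flattened to 2-d

assert _flatten_1d_or_2d(vec).ndim == 1
assert _flatten_1d_or_2d(mat).ndim == 2
assert _flatten_1d_or_2d(t4).ndim == 2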
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(T.flatten(nn.layers.get_output(model.l_target)), 'int32')
    p = predictions[T.arange(predictions.shape[0]), targets]
    p = T.clip(p, epsilon, 1.)
    loss = T.mean(T.log(p))
    return -loss
def feature_extractor(input_data): # conv stage 0 (64x64=>32x32) h0_0 = dnn_conv(input_data, conv_w0_0, border_mode=(1, 1)) + conv_b0_0.dimshuffle("x", 0, "x", "x") h0_1 = dnn_conv(relu(h0_0), conv_w0_1, border_mode=(1, 1)) + conv_b0_1.dimshuffle("x", 0, "x", "x") h0 = dnn_pool(relu(h0_1), ws=(2, 2), stride=(2, 2)) # conv stage 1 (32x32=>16x16) h1_0 = dnn_conv(h0, conv_w1_0, border_mode=(1, 1)) + conv_b1_0.dimshuffle("x", 0, "x", "x") h1_1 = dnn_conv(relu(h1_0), conv_w1_1, border_mode=(1, 1)) + conv_b1_1.dimshuffle("x", 0, "x", "x") h1 = dnn_pool(relu(h1_1), ws=(2, 2), stride=(2, 2)) # conv stage 2 (16x16=>8x8) h2_0 = dnn_conv(h1, conv_w2_0, border_mode=(1, 1)) + conv_b2_0.dimshuffle("x", 0, "x", "x") h2_1 = dnn_conv(relu(h2_0), conv_w2_1, border_mode=(1, 1)) + conv_b2_1.dimshuffle("x", 0, "x", "x") h2_2 = dnn_conv(relu(h2_1), conv_w2_2, border_mode=(1, 1)) + conv_b2_2.dimshuffle("x", 0, "x", "x") h2 = dnn_pool(relu(h2_2), ws=(2, 2), stride=(2, 2)) # conv stage 3 (8x8=>4x4) h3_0 = dnn_conv(h2, conv_w3_0, border_mode=(1, 1)) + conv_b3_0.dimshuffle("x", 0, "x", "x") h3_1 = dnn_conv(relu(h3_0), conv_w3_1, border_mode=(1, 1)) + conv_b3_1.dimshuffle("x", 0, "x", "x") h3_2 = dnn_conv(relu(h3_1), conv_w3_2, border_mode=(1, 1)) + conv_b3_2.dimshuffle("x", 0, "x", "x") h3 = dnn_pool(relu(h3_2), ws=(2, 2), stride=(2, 2)) # conv stage 4 (4x4=>2x2) h4_0 = dnn_conv(h3, conv_w4_0, border_mode=(1, 1)) + conv_b4_0.dimshuffle("x", 0, "x", "x") h4_1 = dnn_conv(relu(h4_0), conv_w4_1, border_mode=(1, 1)) + conv_b4_1.dimshuffle("x", 0, "x", "x") h4_2 = dnn_conv(relu(h4_1), conv_w4_2, border_mode=(1, 1)) + conv_b4_2.dimshuffle("x", 0, "x", "x") h4 = dnn_pool(relu(h4_2), ws=(2, 2), stride=(2, 2)) return T.flatten(h4, 2)
def convolutional_model(X, w_1, w_2, w_3, w_4, w_5, w_6, p_1, p_2, p_3, p_4, p_5):
    # b_1, b_2, b_3 are bias shared variables taken from the enclosing scope
    l1 = dropout(T.tanh(max_pool_2d(T.maximum(conv2d(X, w_1, border_mode='full'), 0.), (2, 2), ignore_border=True) + b_1.dimshuffle('x', 0, 'x', 'x')), p_1)
    l2 = dropout(T.tanh(max_pool_2d(T.maximum(conv2d(l1, w_2), 0.), (2, 2), ignore_border=True) + b_2.dimshuffle('x', 0, 'x', 'x')), p_2)
    # flatten to switch back to 1d layers
    l3 = dropout(T.flatten(T.tanh(max_pool_2d(T.maximum(conv2d(l2, w_3), 0.), (2, 2), ignore_border=True) + b_3.dimshuffle('x', 0, 'x', 'x')), outdim=2), p_3)
    l4 = dropout(T.maximum(T.dot(l3, w_4), 0.), p_4)
    l5 = dropout(T.maximum(T.dot(l4, w_5), 0.), p_5)
    return T.dot(l5, w_6)
def lp_norm(self, n, k, r, c, z):
    '''
    Lp = ( 1/n * sum(|x_i|^p, 1..n) )^(1/p) where p = 1 + ln(1+e^P)
    :param n:
    :param k:
    :param r:
    :param c:
    :param z:
    :return:
    '''
    ds0, ds1 = self.pool_size
    st0, st1 = self.stride
    pad_h = self.pad[0]
    pad_w = self.pad[1]

    row_st = r * st0
    row_end = T.minimum(row_st + ds0, self.img_rows)
    row_st = T.maximum(row_st, self.pad[0])
    row_end = T.minimum(row_end, self.x_m2d + pad_h)

    col_st = c * st1
    col_end = T.minimum(col_st + ds1, self.img_cols)
    col_st = T.maximum(col_st, self.pad[1])
    col_end = T.minimum(col_end, self.x_m1d + pad_w)

    Lp = T.pow(
        T.mean(T.pow(
            T.abs_(T.flatten(self.y[n, k, row_st:row_end, col_st:col_end], 1)),
            1 + T.log(1 + T.exp(self.P))
        )),
        1 / (1 + T.log(1 + T.exp(self.P)))
    )

    return T.set_subtensor(z[n, k, r, c], Lp)
def model(X, params, featMaps, pieces, pDropConv, pDropHidden):
    lnum = 0  # conv: (32, 32) pool: (16, 16)
    layer = conv2d(X, params[lnum][0], border_mode='half') + \
            params[lnum][1].dimshuffle('x', 0, 'x', 'x')
    layer = maxout(layer, featMaps[lnum], pieces[lnum])
    layer = pool_2d(layer, (2, 2), st=(2, 2), ignore_border=False, mode='max')
    layer = basicUtils.dropout(layer, pDropConv)
    lnum += 1  # conv: (16, 16) pool: (8, 8)
    layer = conv2d(layer, params[lnum][0], border_mode='half') + \
            params[lnum][1].dimshuffle('x', 0, 'x', 'x')
    layer = maxout(layer, featMaps[lnum], pieces[lnum])
    layer = pool_2d(layer, (2, 2), st=(2, 2), ignore_border=False, mode='max')
    layer = basicUtils.dropout(layer, pDropConv)
    lnum += 1  # conv: (8, 8) pool: (4, 4)
    layer = conv2d(layer, params[lnum][0], border_mode='half') + \
            params[lnum][1].dimshuffle('x', 0, 'x', 'x')
    layer = maxout(layer, featMaps[lnum], pieces[lnum])
    layer = pool_2d(layer, (2, 2), st=(2, 2), ignore_border=False, mode='max')
    layer = basicUtils.dropout(layer, pDropConv)
    lnum += 1
    layer = T.flatten(layer, outdim=2)
    layer = T.dot(layer, params[lnum][0]) + params[lnum][1].dimshuffle('x', 0)
    layer = relu(layer, alpha=0)
    layer = basicUtils.dropout(layer, pDropHidden)
    lnum += 1
    layer = T.dot(layer, params[lnum][0]) + params[lnum][1].dimshuffle('x', 0)
    layer = relu(layer, alpha=0)
    layer = basicUtils.dropout(layer, pDropHidden)
    lnum += 1
    return softmax(T.dot(layer, params[lnum][0]) + params[lnum][1].dimshuffle('x', 0))  # training with the softmax from T.nnet produced NaNs
input_logits = T.ivector('inputs')
input_logits.tag.test_value = bk.logits("abbabaabba", input_logits.dtype)
xs = T.extra_ops.to_one_hot(input_logits, len(bk.character_set))
target_logits = T.ivector('targets')
target_logits.tag.test_value = bk.logits("bbabaabbab", target_logits.dtype)

outputs, read_address, memory = (partial_bptt(xs, read_address_with_grads,
                                              memory_with_grads, w, output_w))
bptt_cost = softmax_log_likelihood(outputs, target_logits)
if l2_param > 0:
    bptt_cost += l2_param * T.sum((w**2) / 2)  # L2 regularization

j_read_address_w = T.jacobian(read_address, w)
j_memory_w = T.reshape(T.jacobian(T.flatten(memory), w),
                       prev_memory.get_value().shape + w.get_value().shape)
# Reshape will make things broadcastable by default, but then updating fails
# because shared variables are not broadcastable by default and broadcastable
# has to match for a shared var update. We don't want to broadcast anyway.
# This is only a problem if the memory depth is 1, so make the depth not BCable
j_memory_w = T.unbroadcast(j_memory_w, prev_memory.ndim - 1)

update_a_grad = saved_a_grad, j_read_address_w
update_m_grad = saved_m_grad, j_memory_w
update_address = prev_read_address, read_address
update_memory = prev_memory, memory
weight_updates = list(
    lasagne.updates.adadelta(bptt_cost, [w, output_w]).items())
def _recurrence(time_step, x_h_, v_h_, angle_, speed_, t_h_, x_t_, v_t_, a_t_, t_t_, exist, is_leader, x_goal, turn_vec_h, turn_vec_t): # state ''' 1. host 1.1 position (2) - (x,y) coordinates in cross coordinate system 1.2 speed (2) - (v_x,v_y) # 1.3 acceleration (2) - (a_x,a_y) # 1.4 waiting time (1) - start counting on full stop. stop counting when clearing the junction 1.5 x_goal (2) - destination position (indicates different turns) total = 5 2. right lane car 2.1 position (2) - null value = (-1,-1) 2.2 speed (2) - null value = (0,0) 2.3 acceleration (2) - null value = (0,0) 2.4 waiting time (1) - null value = 0 total = 7 3. front lane car 3.1 position (2) 3.2 speed (2) 3.3 acceleration (2) 3.4 waiting time (1) total = 7 4. target 3 4.1 position (2) 4.2 speed (2) 4.3 acceleration (2) 4.4 waiting time (1) total = 7 total = 26 ''' # host_state_vec = tt.concatenate([x_h_, v_h_, t_h_]) ang_spd = tt.stack([angle_, speed_]) host_state_vec = tt.concatenate([x_h_, ang_spd, x_goal]) # target_state_vec = tt.concatenate([tt.flatten(x_t_), tt.flatten(v_t_), tt.flatten(a_t_), tt.flatten(t_t_)]) target_state_vec = tt.concatenate([ tt.flatten(x_t_), tt.flatten(v_t_), tt.flatten(a_t_), is_leader ]) state = tt.concatenate([host_state_vec, target_state_vec]) h0 = tt.dot(state, self.W_0) + self.b_0 relu0 = tt.nnet.relu(h0) h1 = tt.dot(relu0, self.W_1) + self.b_1 relu1 = tt.nnet.relu(h1) h2 = tt.dot(relu1, self.W_2) + self.b_2 relu2 = tt.nnet.relu(h2) a_h = tt.dot(relu2, self.W_c) x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t = _step_state( x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h, x_t_, v_t_, t_t_, turn_vec_t, a_h, exist, time_step) # cost: discount_factor = 0.99**time_step # 0. smooth driving policy cost_steer = discount_factor * a_h[0]**2 cost_accel = discount_factor * a_h[1]**2 # 1. forcing the host to move forward dist_from_goal = tt.mean((x_goal - x_h)**2) cost_progress = discount_factor * dist_from_goal # 2. keeping distance from in front vehicles d_t_h = x_t - x_h h_t_dists = (d_t_h**2).sum(axis=1) # v_h_norm = tt.sqrt((v_h**2).sum()) # d_t_h_norm = tt.sqrt((d_t_h**2).sum(axis=1)) # # denominator = v_h_norm * d_t_h_norm # # host_targets_orientation = tt.dot(d_t_h, v_h) / (denominator + 1e-3) # # in_fornt_targets = tt.nnet.sigmoid(5 * host_targets_orientation) # # close_targets = tt.sum(tt.abs_(d_t_h)) # # cost_accident = tt.mean(in_fornt_targets * close_targets) cost_accident = tt.sum( tt.nnet.relu(self.require_distance - h_t_dists)) # 3. rail divergence cost_right_rail = _dist_from_rail( x_h, self.right_rail_center, self.right_rail_radius) * turn_vec_h[0] cost_front_rail = (x_h[0] - self.lw / 2)**2 * turn_vec_h[1] cost_left_rail = _dist_from_rail( x_h, self.left_rail_center, self.left_rail_radius) * turn_vec_h[2] cost_rail = cost_right_rail + cost_left_rail + cost_front_rail return (x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t, cost_steer, cost_accel, cost_progress, cost_accident, cost_rail, a_h), t.scan_module.until(dist_from_goal < 0.001)
def set_network_trainer(input_data, input_mask, target_data, target_mask, num_outputs, network, updater, learning_rate, grad_max_norm=10., l2_lambda=1e-5, load_updater_params=None): # get one hot target one_hot_target_data = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1), nb_class=num_outputs, dtype=floatX) # get network output data predict_data = get_output(network, deterministic=False) num_seqs = predict_data.shape[0] # get prediction cost predict_data = T.reshape(x=predict_data, newshape=(-1, num_outputs), ndim=2) predict_data = predict_data - T.max(predict_data, axis=-1, keepdims=True) predict_data = predict_data - T.log(T.sum(T.exp(predict_data), axis=-1, keepdims=True)) train_predict_cost = -T.sum(T.mul(one_hot_target_data, predict_data), axis=-1) train_predict_cost = train_predict_cost*T.flatten(target_mask, 1) train_model_cost = train_predict_cost.sum()/num_seqs train_frame_cost = train_predict_cost.sum()/target_mask.sum() # get regularizer cost train_regularizer_cost = regularize_network_params(network, penalty=l2)*l2_lambda # get network parameters network_params = get_all_params(network, trainable=True) # get network gradients network_grads = theano.grad(cost=train_model_cost + train_regularizer_cost, wrt=network_params) if grad_max_norm>0.: network_grads, network_grads_norm = total_norm_constraint(tensor_vars=network_grads, max_norm=grad_max_norm, return_norm=True) else: network_grads_norm = T.sqrt(sum(T.sum(grad**2) for grad in network_grads)) # set updater train_lr = theano.shared(lasagne.utils.floatX(learning_rate)) train_updates, trainer_params = updater(loss_or_grads=network_grads, params=network_params, learning_rate=train_lr, load_params_dict=load_updater_params) # get training (update) function training_fn = theano.function(inputs=[input_data, input_mask, target_data, target_mask], outputs=[train_frame_cost, network_grads_norm], updates=train_updates) return training_fn, trainer_params
def Hx_plain():
    Hx_plain_splits = TT.grad(
        TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
        wrt=params,
        disconnected_inputs='warn')
    return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])
def get_l2_regularization(self, extra_params=[]):
    return T.mean(
        T.concatenate([T.flatten(layer.W) for layer in self.layers] + extra_params) ** 2.)
def step(input_n, cell_previous, hid_previous, *args): # word-by-word attention mh = T.dot(input_n, self.W_h_attend) + T.dot( hid_previous, self.W_m_attend) # mh is (n_batch, 1, n_features) mh = mh.dimshuffle(0, 'x', 1) M = T.dot(encoder_hs, self.W_y_attend) + mh # (n_batch, n_time_steps, n_features) M = nonlinearities.tanh(M) # alpha is (n_batch, n_time_steps, 1) alpha = T.dot(M, self.w_attend) # now is (n_batch, n_time_steps) alpha = T.flatten(alpha, 2) # 0 after softmax is not 0, f**k, my mistake. # when i > encoder_seq_len, fill alpha_i to -np.inf # alpha = T.switch(encoder_mask, alpha, -np.inf) alpha = T.nnet.softmax(alpha) # apply encoder_mask to alpha # encoder_mask is (n_batch, n_time_steps) # when i > encoder_seq_len, alpha_i should be 0. # actually not need mask, but in case of error # alpha = alpha * encoder_mask alpha = alpha.dimshuffle(0, 1, 'x') weighted_encoder = T.sum(encoder_hs * alpha, axis=1) r = weighted_encoder # (n_batch, n_features) input_n = T.concatenate([r, input_n], axis=1) if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(cell) return [cell, hid]
def forward(self, inputtensor):
    inputimage = inputtensor[0]
    return (T.flatten(inputimage, outdim=2),)
def MSE(self):
    # self.cost = T.mean(T.sum((self.y - self.fully_connected.output)**2))
    m = T.sum(T.flatten((self.inpt - self.trunc_output) ** 2, outdim=2) * self.df, axis=1)
    return T.mean(4 * m - self.latent_layer.prior)
def __init__(self, dim_z, x_train, x_test, diff=None, magic=5000): ####################################### SETTINGS ################################### self.x_train = x_train self.x_test = x_test self.diff = diff self.batch_size = 100. self.learning_rate = theano.shared(np.float32(0.0008)) self.momentum = 0.3 self.performance = {"train": [], "test": []} self.inpt = T.ftensor4(name='input') self.df = T.fmatrix(name='differential') self.dim_z = dim_z self.generative_z = theano.shared(np.float32(np.zeros([1, dim_z]))) self.activation = relu self.generative = False self.out_distribution = False #self.y = T.matrix(name="y") self.in_filters = [64, 64, 64] self.filter_lengths = [10., 10., 10.] self.params = [] #magic = 73888. self.magic = magic self.dropout_symbolic = T.fscalar() self.dropout_prob = theano.shared(np.float32(0.0)) ####################################### LAYERS ###################################### # LAYER 1 ############################## self.conv1 = one_d_conv_layer(self.inpt, self.in_filters[0], 1, self.filter_lengths[0], param_names=["W1", 'b1']) self.params += self.conv1.params self.bn1 = batchnorm(self.conv1.output) self.nl1 = self.activation(self.bn1.X) self.maxpool1 = ds.max_pool_2d(self.nl1, [3, 1], st=[2, 1], ignore_border=False).astype( theano.config.floatX) self.layer1_out = dropout(self.maxpool1, self.dropout_symbolic) #self.layer1_out = self.maxpool1 # LAYER2 ################################ self.flattened = T.flatten(self.layer1_out, outdim=2) # Variational Layer ##################### self.latent_layer = variational_gauss_layer(self.flattened, self.magic, dim_z) self.params += self.latent_layer.params self.latent_out = self.latent_layer.output # Hidden Layer ######################### self.hidden_layer = hidden_layer(self.latent_out, dim_z, self.magic) self.params += self.hidden_layer.params self.hid_out = dropout( self.activation(self.hidden_layer.output).reshape( (self.inpt.shape[0], self.in_filters[-1], int(self.magic / self.in_filters[-1]), 1)), self.dropout_symbolic) # Devonvolutional 1 ###################### self.deconv1 = one_d_deconv_layer(self.hid_out, 1, self.in_filters[2], self.filter_lengths[2], pool=2., param_names=["W3", 'b3'], distribution=False) self.params += self.deconv1.params #self.nl_deconv1 = dropout(self.activation(self.deconv1.output),self.dropout_symbolic) self.tanh_out = self.deconv1.output self.last_layer = self.deconv1 if self.out_distribution == True: self.trunk_sigma = self.last_layer.log_sigma[:, :, :self.inpt. 
shape[2], :] self.trunc_output = self.tanh_out[:, :, :self.inpt.shape[2], :] ################################### FUNCTIONS ###################################################### self.get_latent_states = theano.function( [self.inpt], self.latent_out, givens=[[self.dropout_symbolic, self.dropout_prob]]) #self.prior_debug = theano.function([self.inpt],[self.latent_out,self.latent_layer.mu_encoder,self.latent_layer.log_sigma_encoder,self.latent_layer.prior]) #self.get_prior = theano.function([self.inpt],self.latent_layer.prior) #self.convolve1 = theano.function([self.inpt],self.layer1_out) #self.convolve2 = theano.function([self.inpt],self.layer2_out) self.output = theano.function( [self.inpt], self.trunc_output, givens=[[self.dropout_symbolic, self.dropout_prob]]) self.get_flattened = theano.function( [self.inpt], self.flattened, givens=[[self.dropout_symbolic, self.dropout_prob]]) #self.deconvolve1 = theano.function([self.inpt],self.deconv1.output) #self.deconvolve2 = theano.function([self.inpt],self.deconv2.output) #self.sig_out = theano.function([self.inpt],T.flatten(self.trunk_sigma,outdim=2)) self.output = theano.function( [self.inpt], self.trunc_output, givens=[[self.dropout_symbolic, self.dropout_prob]]) #self.generate_from_z = theano.function([self.inpt],self.trunc_output,givens = [[self.latent_out,self.generative_z]]) self.generate_from_z = theano.function( [self.inpt], self.trunc_output, givens=[[self.dropout_symbolic, self.dropout_prob], [self.latent_out, self.generative_z]]) self.cost = self.MSE() self.mse = self.MSE() #self.likelihood = self.log_px_z() #self.get_cost = theano.function([self.inpt],[self.cost,self.mse]) #self.get_likelihood = theano.function([self.layer1.inpt],[self.likelihood]) self.derivatives = T.grad(self.cost, self.params) #self.get_gradients = theano.function([self.inpt],self.derivatives) self.updates = adam(self.params, self.derivatives, self.learning_rate) #self.updates =momentum_update(self.params,self.derivatives,self.learning_rate,self.momentum) self.train_model = theano.function( inputs=[self.inpt, self.df], outputs=self.cost, updates=self.updates, givens=[[self.dropout_symbolic, self.dropout_prob]])
def get_parent_state(self, children_states, node_type, use_dropout: bool, iteration_number) -> tuple:
    w = self.__w_with_dropout if use_dropout else self.__w
    return T.tanh(T.dot(w[node_type], T.flatten(children_states)) + self.__bias[node_type]), 0
def __init__(self, config, testMode): self.config = config batch_size = config['batch_size'] lib_conv = config['lib_conv'] useLayers = config['useLayers'] #imgWidth = config['imgWidth'] #imgHeight = config['imgHeight'] initWeights = config['initWeights'] #if we wish to initialize alexnet with some weights. #need to make changes in layers.py to accept initilizing weights if initWeights: weightsDir = config['weightsDir'] weightFileTag = config['weightFileTag'] prob_drop = config['prob_drop'] # ##################### BUILD NETWORK ########################## x = T.ftensor4('x') mean = T.ftensor4('mean') #y = T.lvector('y') print '... building the model' self.layers = [] params = [] weight_types = [] if useLayers >= 1: convpool_layer1 = ConvPoolLayer(input=x-mean, image_shape=(3, None, None, batch_size), filter_shape=(3, 11, 11, 96), convstride=4, padsize=0, group=1, poolsize=3, poolstride=2, bias_init=0.0, lrn=True, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_0'+weightFileTag, 'b_0'+weightFileTag] ) self.layers.append(convpool_layer1) params += convpool_layer1.params weight_types += convpool_layer1.weight_type if useLayers >= 2: convpool_layer2 = ConvPoolLayer(input=convpool_layer1.output, image_shape=(96, None, None, batch_size), #change from 27 to appropriate value sbased on conv1's output filter_shape=(96, 5, 5, 256), convstride=1, padsize=2, group=2, poolsize=3, poolstride=2, bias_init=0.1, lrn=True, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W0_1'+weightFileTag, 'W1_1'+weightFileTag, 'b0_1'+weightFileTag, 'b1_1'+weightFileTag] ) self.layers.append(convpool_layer2) params += convpool_layer2.params weight_types += convpool_layer2.weight_type if useLayers >= 3: convpool_layer3 = ConvPoolLayer(input=convpool_layer2.output, image_shape=(256, None, None, batch_size), filter_shape=(256, 3, 3, 384), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, bias_init=0.0, lrn=False, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_2'+weightFileTag, 'b_2'+weightFileTag] ) self.layers.append(convpool_layer3) params += convpool_layer3.params weight_types += convpool_layer3.weight_type if useLayers >= 4: convpool_layer4 = ConvPoolLayer(input=convpool_layer3.output, image_shape=(384, None, None, batch_size), filter_shape=(384, 3, 3, 384), convstride=1, padsize=1, group=2, poolsize=1, poolstride=0, bias_init=0.1, lrn=False, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W0_3'+weightFileTag, 'W1_3'+weightFileTag, 'b0_3'+weightFileTag, 'b1_3'+weightFileTag] ) self.layers.append(convpool_layer4) params += convpool_layer4.params weight_types += convpool_layer4.weight_type if useLayers >= 5: convpool_layer5 = ConvPoolLayer(input=convpool_layer4.output, image_shape=(384, None, None, batch_size), filter_shape=(384, 3, 3, 256), convstride=1, padsize=1, group=2, poolsize=3, poolstride=2, bias_init=0.0, lrn=False, lib_conv=lib_conv, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W0_4'+weightFileTag, 'W1_4'+weightFileTag, 'b0_4'+weightFileTag, 'b1_4'+weightFileTag] ) self.layers.append(convpool_layer5) params += convpool_layer5.params weight_types += convpool_layer5.weight_type if useLayers >= 6: fc_layer6_input = T.flatten(convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2) fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_5'+weightFileTag, 'b_5'+weightFileTag]) 
self.layers.append(fc_layer6) params += fc_layer6.params weight_types += fc_layer6.weight_type if testMode: dropout_layer6 = fc_layer6 else: dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096, prob_drop=prob_drop) if useLayers >= 7: fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_6'+weightFileTag, 'b_6'+weightFileTag]) self.layers.append(fc_layer7) params += fc_layer7.params weight_types += fc_layer7.weight_type if testMode: dropout_layer6 = fc_layer7 else: dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096, prob_drop=prob_drop) if useLayers >= 8: softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output, n_in=4096, n_out=1000, initWeights=initWeights, weightsDir=weightsDir, weightFiles=['W_7'+weightFileTag, 'b_7'+weightFileTag]) self.layers.append(softmax_layer8) params += softmax_layer8.params weight_types += softmax_layer8.weight_type # #################### NETWORK BUILT ####################### self.output = self.layers[useLayers-1] self.params = params self.x = x self.mean = mean self.weight_types = weight_types self.batch_size = batch_size self.useLayers = useLayers self.outLayer = self.layers[useLayers-1] meanVal = np.load(config['mean_file']) meanVal = meanVal[:, :, :, np.newaxis].astype('float32') #x is 4d, with 'batch' number of images. meanVal has only '1' in the 'batch' dimension. subtraction wont work. meanVal = np.tile(meanVal,(1,1,1,batch_size)) self.meanVal = meanVal #meanVal = np.zeros([3,imgHeight,imgWidth,2], dtype='float32') if useLayers >= 8: #if last layer is softmax, then its output is y_pred finalOut = self.outLayer.y_pred else: finalOut = self.outLayer.output self.forwardFunction = theano.function([self.x, In(self.mean, value=meanVal)], [finalOut])
def flatten(x):
    return T.flatten(x)
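The wrapper above uses T.flatten's default outdim=1, which collapses the whole input into a single vector; a quick check, assuming the wrapper is in scope (shapes illustrative):

# Sketch: T.flatten with the default outdim=1 returns a 1-d vector.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')
f = theano.function([x], flatten(x).shape)
print f(np.zeros((2, 3, 4), dtype=theano.config.floatX))  # prints [24]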
def convolve(kerns, kshp, nkern, images, imgshp, step=(1, 1), bias=None, mode='valid', flatten=True): """Convolution implementation by sparse matrix multiplication. :note: For best speed, put the matrix which you expect to be smaller as the 'kernel' argument "images" is assumed to be a matrix of shape batch_size x img_size, where the second dimension represents each image in raster order If flatten is "False", the output feature map will have shape: .. code-block:: python batch_size x number of kernels x output_size If flatten is "True", the output feature map will have shape: .. code-block:: python batch_size x number of kernels * output_size .. note:: IMPORTANT: note that this means that each feature map (image generate by each kernel) is contiguous in memory. The memory layout will therefore be: [ <feature_map_0> <feature_map_1> ... <feature_map_n>], where <feature_map> represents a "feature map" in raster order kerns is a 2D tensor of shape nkern x N.prod(kshp) :param kerns: 2D tensor containing kernels which are applied at every pixel :param kshp: tuple containing actual dimensions of kernel (not symbolic) :param nkern: number of kernels/filters to apply. nkern=1 will apply one common filter to all input pixels :param images: tensor containing images on which to apply convolution :param imgshp: tuple containing image dimensions :param step: determines number of pixels between adjacent receptive fields (tuple containing dx,dy values) :param mode: 'full', 'valid' see CSM.evaluate function for details :param sumdims: dimensions over which to sum for the tensordot operation. By default ((2,),(1,)) assumes kerns is a nkern x kernsize matrix and images is a batchsize x imgsize matrix containing flattened images in raster order :param flatten: flatten the last 2 dimensions of the output. By default, instead of generating a batchsize x outsize x nkern tensor, will flatten to batchsize x outsize*nkern :return: out1, symbolic result :return: out2, logical shape of the output img (nkern,heigt,width) :TODO: test for 1D and think of how to do n-d convolutions """ # start by computing output dimensions, size, etc kern_size = np.int64(np.prod(kshp)) # inshp contains either 2 entries (height,width) or 3 (nfeatures,h,w) # in the first case, default nfeatures to 1 if np.size(imgshp) == 2: imgshp = (1, ) + imgshp # construct indices and index pointers for sparse matrix, which, # when multiplied with input images will generate a stack of image # patches indices, indptr, spmat_shape, sptype, outshp = \ convolution_indices.conv_eval(imgshp, kshp, step, mode) # build sparse matrix, then generate stack of image patches csc = theano.sparse.CSM(sptype)(np.ones(indices.size), indices, indptr, spmat_shape) patches = (sparse.structured_dot(csc, images.T)).T # compute output of linear classifier pshape = tensor.stack([images.shape[0] * tensor.as_tensor(np.prod(outshp)),\ tensor.as_tensor(imgshp[0] * kern_size)]) patch_stack = tensor.reshape(patches, pshape, ndim=2) # kern is of shape: nkern x ksize*number_of_input_features # output is thus of shape: bsize*outshp x nkern output = tensor.dot(patch_stack, kerns.T) # add bias across each feature map (more efficient to do it now) if bias is not None: output += bias # now to have feature maps in raster order ... 
# go from bsize*outshp x nkern to bsize x nkern*outshp newshp = tensor.stack([images.shape[0],\ tensor.as_tensor(np.prod(outshp)),\ tensor.as_tensor(nkern)]) tensout = tensor.reshape(output, newshp, ndim=3) output = tensor.DimShuffle((False, ) * tensout.ndim, (0, 2, 1))(tensout) if flatten: output = tensor.flatten(output, 2) return output, np.hstack((nkern, outshp))
def build_objective(model, deterministic=False, epsilon=1.e-7):
    predictions = T.flatten(
        nn.layers.get_output(model.l_out, deterministic=deterministic))
    targets = T.flatten(nn.layers.get_output(model.l_target))
    preds = T.clip(predictions, epsilon, 1. - epsilon)
    return T.mean(nn.objectives.binary_crossentropy(preds, targets))
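The pattern above, flattening both the network output and the target map before the elementwise binary cross-entropy, can be checked in isolation without a lasagne model (shapes and values below are illustrative):

# Sketch: flatten predictions and targets, clip, then average binary cross-entropy.
import numpy as np
import theano
import theano.tensor as T

pred = T.tensor4('pred')    # e.g. (batch, 1, rows, cols) segmentation probabilities
targ = T.tensor4('targ')    # same shape, values in {0, 1}
eps = 1.e-7
p = T.clip(T.flatten(pred), eps, 1. - eps)
t = T.flatten(targ)
loss = T.mean(T.nnet.binary_crossentropy(p, t))
f = theano.function([pred, targ], loss)
print f(np.full((2, 1, 4, 4), 0.9, dtype=theano.config.floatX),
        np.ones((2, 1, 4, 4), dtype=theano.config.floatX))  # ~ -ln(0.9) = 0.105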
def op(self, state):
    X = self.l_in.op(state=state)
    return T.flatten(X, outdim=self.axes)
def build_computation_graph(self): ###################### BUILD NETWORK ########################## # whether or not to mirror the input images before feeding them into the network if self.flag_datalayer: layer_1_input = mirror_images( input=self.x, image_shape=( self.batch_size, 3, 256, 256, ), # bc01 format cropsize=227, rand=self.rand, flag_rand=self.rand_crop) else: layer_1_input = self.x # 4D tensor (going to be in c01b format) # Start with 5 convolutional pooling layers log.debug("convpool layer 1...") convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input), filter_shape=(96, 3, 11, 11), convstride=4, padsize=0, group=1, poolsize=3, poolstride=2, bias_init=0.0, local_response_normalization=True) # Add this layer's parameters! self.params += convpool_layer1.get_params() log.debug("convpool layer 2...") convpool_layer2 = ConvPoolLayer(inputs_hook=(( self.batch_size, 96, 27, 27, ), convpool_layer1.get_outputs()), filter_shape=(256, 96, 5, 5), convstride=1, padsize=2, group=2, poolsize=3, poolstride=2, bias_init=0.1, local_response_normalization=True) # Add this layer's parameters! self.params += convpool_layer2.get_params() log.debug("convpool layer 3...") convpool_layer3 = ConvPoolLayer( inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()), filter_shape=(384, 256, 3, 3), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, bias_init=0.0, local_response_normalization=False) # Add this layer's parameters! self.params += convpool_layer3.get_params() log.debug("convpool layer 4...") convpool_layer4 = ConvPoolLayer( inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()), filter_shape=(384, 384, 3, 3), convstride=1, padsize=1, group=2, poolsize=1, poolstride=0, bias_init=0.1, local_response_normalization=False) # Add this layer's parameters! self.params += convpool_layer4.get_params() log.debug("convpool layer 5...") convpool_layer5 = ConvPoolLayer( inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()), filter_shape=(256, 384, 3, 3), convstride=1, padsize=1, group=2, poolsize=3, poolstride=2, bias_init=0.0, local_response_normalization=False) # Add this layer's parameters! self.params += convpool_layer5.get_params() # Now onto the fully-connected layers! fc_config = { 'activation': 'rectifier', # type of activation function to use for output 'weights_init': 'gaussian', # either 'gaussian' or 'uniform' - how to initialize weights 'weights_mean': 0.0, # mean for gaussian weights init 'weights_std': 0.005, # standard deviation for gaussian weights init 'bias_init': 0.0 # how to initialize the bias parameter } log.debug("fully connected layer 1 (model layer 6)...") # we want to have dropout applied to the training version, but not the test version. fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2) fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input), output_size=4096, config=fc_config) # Add this layer's parameters! self.params += fc_layer6.get_params() # now apply dropout to the output for training dropout_layer6 = dropout(fc_layer6.get_outputs(), corruption_level=0.5) log.debug("fully connected layer 2 (model layer 7)...") fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()), output_size=4096, config=fc_config) fc_layer7_train = BasicLayer(inputs_hook=(4096, dropout_layer6), output_size=4096, params_hook=fc_layer7.get_params(), config=fc_config) # Add this layer's parameters! 
self.params += fc_layer7_train.get_params() # apply dropout again for training dropout_layer7 = dropout(fc_layer7_train.get_outputs(), corruption_level=0.5) # last layer is a softmax prediction output layer softmax_config = { 'weights_init': 'gaussian', 'weights_mean': 0.0, 'weights_std': 0.005, 'bias_init': 0.0 } log.debug("softmax classification layer (model layer 8)...") softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()), output_size=1000, config=softmax_config) softmax_layer8_train = SoftmaxLayer( inputs_hook=(4096, dropout_layer7), output_size=1000, params_hook=softmax_layer8.get_params(), config=softmax_config) # Add this layer's parameters! self.params += softmax_layer8.get_params() # finally the softmax output from the whole thing! self.output = softmax_layer8.get_outputs() ##################### # Cost and monitors # ##################### self.train_cost = softmax_layer8_train.negative_log_likelihood(self.y) cost = softmax_layer8.negative_log_likelihood(self.y) errors = softmax_layer8.errors(self.y) train_errors = softmax_layer8_train.errors(self.y) self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)]) ######################### # Compile the functions # ######################### log.debug("Compiling functions!") t = time.time() log.debug("f_predict...") # use the actual argmax from the classification self.f_predict = function( inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction()) log.debug("f_monitors") self.f_monitors = function(inputs=[self.x, self.y], outputs=self.monitors.values()) log.debug("compilation took %s" % make_time_units_string(time.time() - t))
def gradient1(f, v):
    """flat gradient of f wrt v"""
    return tt.flatten(tt.grad(f, v, disconnected_inputs="warn"))
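A small usage sketch for `gradient1`, flattening the gradient of a scalar cost with respect to a matrix parameter (the cost expression and parameter below are illustrative):

# Sketch: gradient1 turns a matrix-shaped gradient into a flat vector,
# which is convenient when concatenating gradients of several parameters.
import numpy as np
import theano
import theano.tensor as tt

W = theano.shared(np.ones((3, 4)), name='W')
cost = tt.sum(W ** 2)      # illustrative scalar cost
g = gradient1(cost, W)     # shape (12,) instead of (3, 4)
print theano.function([], g.shape)()  # prints [12]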
def get_output_for(self, inputs, **kwargs): """ Have to re-write LSTMLayer's output construction because we need cell_out, which is not stored in the original """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 4*num_units). # We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(cell) return [cell, hid] def step_masked(input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. not_mask = 1 - mask_n cell = cell * mask_n + cell_previous * not_mask hid = hid * mask_n + hid_previous * not_mask return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if isinstance(self.cell_init, Layer): pass elif isinstance(self.cell_init, T.TensorVariable): cell_init = self.cell_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if isinstance(self.hid_init, Layer): pass elif isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] cell_out = cell_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) cell_out = cell_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] cell_out = cell_out[:, ::-1] return T.concatenate([cell_out, hid_out], axis=2)
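Unlike the stock Lasagne LSTMLayer, this variant returns the cell and hidden states concatenated along the feature axis, so downstream code has to split them again. A minimal sketch of that split on the symbolic sequence output (num_units is an assumed value, not taken from the snippet):

import theano.tensor as T

num_units = 128                            # assumption, not taken from the snippet
out = T.tensor3("cell_and_hid")            # (n_batch, n_time_steps, 2 * num_units)
cell_part = out[:, :, :num_units]          # first half of the feature axis
hid_part = out[:, :, num_units:]           # second half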
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When the cell state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When both the cell state and the hidden state are being pre-filled `inputs[-2]` is the hidden state, while `inputs[-1]` is the cell state. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Same for hidden weight matrices W_hid_stacked = T.concatenate( [self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate], axis=1) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate], axis=1) if not self.batch_norm: # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) input = T.dot(input, W_in_stacked) + b_stacked else: input = self.bn.get_output_for(T.dot(input, W_in_stacked), mask, **kwargs) else: # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) # At each call to scan, input_n will be (n_time_steps, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n*self.num_units:(n+1)*self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous*self.W_cell_to_ingate forgetgate += cell_previous*self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate*cell_previous + ingate*cell_input if self.peepholes: outgate += cell*self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate*self.nonlinearity(cell) return [cell, hid] def step_masked(input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately 
after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
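This step_masked uses T.switch on the broadcasted mask column, while the earlier variant wrote the same thing arithmetically as mask_n * new + (1 - mask_n) * old. An illustrative check that the two forms agree:

import numpy as np
import theano
import theano.tensor as T

mask_n = T.col("mask_n")                 # (n_batch, 1), values in {0, 1}
new = T.matrix("new")
old = T.matrix("old")

switched = T.switch(mask_n, new, old)
blended = new * mask_n + old * (1 - mask_n)
f = theano.function([mask_n, new, old], [switched, blended])

m = np.array([[1.], [0.]], dtype=theano.config.floatX)
a = np.ones((2, 3), dtype=theano.config.floatX)
b = np.full((2, 3), 7., dtype=theano.config.floatX)
s, bl = f(m, a, b)
print(np.allclose(s, bl))                # True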
def get_output_for(self, inputs, deterministic=False, **kwargs): if not self.stochastic and not deterministic: deterministic = True print "deterministic mode: ", deterministic def apply_regularization(weights, hid=False): current_w0 = self.w0 if hid: current_w0 = self.w0_hid if self.mean_substraction_rounding: return weights elif self.mode == 'ternary': return ternarize_weights(weights, w0=current_w0, deterministic=deterministic, srng=self.srng) elif self.mode == "binary": return binarize_weights(weights, 1., self.srng, deterministic=deterministic) elif self.mode == "dual-copy": return quantize_weights(weights, srng=self.srng, deterministic=deterministic) else: return weights if self.round_input_weights: self.Wb_in_to_hid = apply_regularization(self.W_in_to_hid) if self.round_hid: self.Wb_hid_to_hid = apply_regularization(self.W_hid_to_hid) if self.round_bias: self.bb = apply_regularization(self.b) if self.round_input_weights: Wr_in_to_hid = self.W_in_to_hid if self.round_hid: Wr_hid_to_hid = self.W_hid_to_hid if self.round_bias: br = self.b if self.round_input_weights: self.W_in_to_hid = self.Wb_in_to_hid if self.round_hid: self.W_hid_to_hid = self.Wb_hid_to_hid if self.round_bias: self.b = self.bb input = inputs[0] if self.batch_norm: input = self.bn.get_output_for(input, deterministic=deterministic, **kwargs) if len(inputs) > 1: new_inputs = [input, inputs[1]] else: new_inputs = [input] else: new_inputs = inputs inputs = new_inputs input = inputs[0] mask = None hid_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if input.ndim > 3: input = T.flatten(input, 3) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape W_in_stacked = T.concatenate([self.W_in_to_hid], axis=1) W_hid_stacked = T.concatenate([self.W_hid_to_hid], axis=1) b_stacked = T.concatenate([self.b], axis=0) if self.precompute_input: input = T.dot(input, W_in_stacked) + b_stacked def step(input_n, hid_previous, *args): hid_input = T.dot(hid_previous, W_hid_stacked) if self.grad_clipping: input_n = theano.gradient.grad_clip(input_n, -self.grad_clipping, self.grad_clipping) hid_input = theano.gradient.grad_clip(hid_input, -self.grad_clipping, self.grad_clipping) if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked hid = self.nonlinearity(hid_input + input_n) return hid def step_masked(input_n, mask_n, hid_previous, *args): hid = step(input_n, hid_previous, *args) hid = T.switch(mask_n, hid, hid_previous) return hid if mask is not None: mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = [input] step_fun = step if not isinstance(self.hid_init, lasagne.layers.Layer): hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init) non_seqs = [W_hid_stacked] if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: input_shape = self.input_shapes[0] hid_out = lasagne.utils.unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1])[0] else: hid_out = theano.scan(fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] if self.only_return_final: hid_out = hid_out[-1] else: hid_out = hid_out.dimshuffle(1, 0, 2) if self.backwards: hid_out = hid_out[:, ::-1] if self.round_input_weights: self.W_in_to_hid = 
Wr_in_to_hid if self.round_hid: self.W_hid_to_hid = Wr_hid_to_hid if self.round_bias: self.b = br return hid_out
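ternarize_weights, binarize_weights and quantize_weights are project helpers that are not shown here. Purely for illustration, and not the author's implementation, a deterministic sign-based binarization could look like this:

import numpy as np
import theano
import theano.tensor as T

def sign_binarize(W, w0=1.0):
    # deterministic sign binarization to +/- w0 (illustrative only)
    return T.switch(T.ge(W, 0.), w0, -w0)

W = theano.shared(np.random.randn(4, 4).astype(theano.config.floatX), name="W")
print(theano.function([], sign_binarize(W))())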
def conv2d( input, filters, image_shape=None, filter_shape=None, border_mode="valid", subsample=(1, 1), **kargs, ): """ signal.conv.conv2d performs a basic 2D convolution of the input with the given filters. The input parameter can be a single 2D image or a 3D tensor, containing a set of images. Similarly, filters can be a single 2D filter or a 3D tensor, corresponding to a set of 2D filters. Shape parameters are optional and will result in faster execution. Parameters ---------- input : Symbolic theano tensor for images to be filtered. Dimensions: ([num_images], image height, image width) filters : Symbolic theano tensor for convolution filter(s). Dimensions: ([num_filters], filter height, filter width) border_mode: {'valid', 'full'} See scipy.signal.convolve2d. subsample Factor by which to subsample output. image_shape : tuple of length 2 or 3 ([num_images,] image height, image width). filter_shape : tuple of length 2 or 3 ([num_filters,] filter height, filter width). kwargs See theano.tensor.nnet.conv.conv2d. Returns ------- symbolic 2D,3D or 4D tensor Tensor of filtered images, with shape ([number images,] [number filters,] image height, image width). """ assert input.ndim in (2, 3) assert filters.ndim in (2, 3) # use shape information if it is given to us ### if filter_shape and image_shape: if input.ndim == 3: bsize = image_shape[0] else: bsize = 1 imshp = (1, ) + tuple(image_shape[-2:]) if filters.ndim == 3: nkern = filter_shape[0] else: nkern = 1 kshp = filter_shape[-2:] else: nkern, kshp = None, None bsize, imshp = None, None # reshape tensors to 4D, for compatibility with ConvOp ### if input.ndim == 3: sym_bsize = input.shape[0] else: sym_bsize = 1 if filters.ndim == 3: sym_nkern = filters.shape[0] else: sym_nkern = 1 new_input_shape = tensor.join(0, tensor.stack([sym_bsize, 1]), input.shape[-2:]) input4D = tensor.reshape(input, new_input_shape, ndim=4) new_filter_shape = tensor.join(0, tensor.stack([sym_nkern, 1]), filters.shape[-2:]) filters4D = tensor.reshape(filters, new_filter_shape, ndim=4) # perform actual convolution ### op = conv.ConvOp( output_mode=border_mode, dx=subsample[0], dy=subsample[1], imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize, **kargs, ) output = op(input4D, filters4D) # flatten to 3D tensor if convolving with single filter or single image if input.ndim == 2 and filters.ndim == 2: if theano.config.warn.signal_conv2d_interface: warnings.warn( "theano.tensor.signal.conv2d() now outputs a 2d tensor when both" " inputs are 2d. To disable this warning, set the Theano flag" " warn.signal_conv2d_interface to False", stacklevel=3, ) output = tensor.flatten(output.T, ndim=2).T elif input.ndim == 2 or filters.ndim == 2: output = tensor.flatten(output.T, ndim=3).T return output
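Assuming this is the theano.tensor.signal.conv.conv2d wrapper, a minimal usage sketch with a single 2D image and a single 2D filter (the 2d/2d interface warning mentioned in the code may be printed):

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.signal import conv

img = T.matrix("img")
kern = T.matrix("kern")
out = conv.conv2d(img, kern, border_mode="valid")
f = theano.function([img, kern], out)

image = np.random.rand(5, 5).astype(theano.config.floatX)
kernel = np.ones((3, 3), dtype=theano.config.floatX)
print(f(image, kernel).shape)             # (3, 3) in 'valid' mode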
# A test script to validate causal dilated convolutions
import numpy as np
import theano
import theano.tensor as T
import keras.backend as K

dilation = 2
input = T.fvector()
filters = T.fvector()

input_reshaped = T.reshape(input, (1, -1, 1))
input_reshaped = K.temporal_pre_padding(input_reshaped, padding=dilation)
input_reshaped = T.reshape(input_reshaped, (1, 1, -1, 1))
# filters are laid out as (output channels, input channels, filter rows, filter columns)
filters_reshaped = T.reshape(filters, (1, 1, -1, 1))
out = T.nnet.conv2d(input_reshaped, filters_reshaped, border_mode='valid', filter_dilation=(dilation, 1))
out = T.reshape(out, (1, -1, 1))
out = K.temporal_pre_padding(out, padding=dilation)
out = T.reshape(out, (1, 1, -1, 1))
out = T.nnet.conv2d(out, filters_reshaped, border_mode='valid', filter_dilation=(dilation, 1))
out = T.flatten(out)

in_input = np.arange(8, dtype='float32')
in_filters = np.array([1, 1], dtype='float32')
f = theano.function([input, filters], out)
print "".join(["%3.0f" % i for i in in_input])
print "".join(["%3.0f" % i for i in f(in_input, in_filters)])
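A single-layer variant of the same causal dilated convolution, written with plain Theano ops (zero pre-padding by concatenation) so the expected result is easy to state: out[t] = x[t] + x[t - dilation] for a [1, 1] filter. This is only a cross-check sketch, not part of the original script.

import numpy as np
import theano
import theano.tensor as T

dilation = 2
x = T.fvector("x")
w = T.fvector("w")                                 # a 2-tap filter, e.g. [1, 1]

x_pad = T.concatenate([T.zeros((dilation,), dtype=x.dtype), x])   # causal pre-padding
x4 = x_pad.reshape((1, 1, -1, 1))
w4 = w.reshape((1, 1, -1, 1))
y = T.nnet.conv2d(x4, w4, border_mode='valid', filter_dilation=(dilation, 1))
f = theano.function([x, w], T.flatten(y))

xs = np.arange(8, dtype='float32')
print(f(xs, np.array([1, 1], dtype='float32')))    # x[t] + x[t-2], zeros before t=0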
image_shape=x_shp, filter_shape=w_fb.shape, border_mode='valid') s_P_sum = theano.shared(w_fb.sum(3).sum(2).sum(1)) Pmmm = p_mean * s_P_sum.dimshuffle(0, 'x', 'x') s_PM = theano.shared((w_means * w_fb).sum(3).sum(2).sum(1)) z = p_scale * (Px - Pmmm) - s_PM.dimshuffle(0, 'x', 'x') assert z.dtype == x.dtype, (z.dtype, x.dtype) return z, (_shp[0], kN, _shp[2], _shp[3]) @pyll.scope.define def slm_flatten((x, x_shp), ): r = tensor.flatten(x, 2) r_shp = x_shp[0], np.prod(x_shp[1:]) return r, r_shp @pyll.scope.define_info(o_len=2) def slm_lpool_smallgrid((x, x_shp), grid_res=2, order=1): """ Like lpool, but parametrized to produce a fixed size image as output. The image is not rescaled, but rather single giant box filters are defined for each output pixel, and stored in a matrix. """ assert x.dtype == 'float32' order = float(order) if hasattr(order, '__iter__'):
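slm_flatten carries a logical shape tuple alongside the symbolic tensor. An illustrative stand-alone version of the same pattern, with an assumed input shape:

import numpy as np
import theano
import theano.tensor as T

x = T.tensor4("x")
x_shp = (16, 32, 8, 8)                            # (batch, channels, rows, cols), assumed
r = T.flatten(x, 2)                               # symbolic: (batch, channels * rows * cols)
r_shp = (x_shp[0], int(np.prod(x_shp[1:])))       # logical shape carried alongside
f = theano.function([x], r)
print(f(np.zeros(x_shp, dtype=theano.config.floatX)).shape)   # (16, 2048)
print(r_shp)                                      # (16, 2048)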
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When the cell state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When both the cell state and the hidden state are being pre-filled `inputs[-2]` is the hidden state, while `inputs[-1]` is the cell state. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None encoder_hs = None encoder_mask = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.encoder_mask_incoming_index > 0: # (n_batch, n_time_steps) encoder_mask = inputs[self.encoder_mask_incoming_index] encoder_mask = encoder_mask.astype('float32') cell_init = inputs[self.cell_init_incoming_index] if self.attention: # (n_batch, n_time_steps, n_features) encoder_hs = cell_init[0] # encoder_mask is # (n_batch, n_time_steps, 1) encoder_hs = encoder_hs * encoder_mask.dimshuffle(0, 1, 'x') cell_init = cell_init[1] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, previous_r, *args): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(cell) r = previous_r if self.attention and self.word_by_word: mh = T.dot(hid, self.W_h_attend) + T.dot( previous_r, self.W_r_attend) # mh is (n_batch, 1, n_features) mh = mh.dimshuffle(0, 'x', 1) M = T.dot(encoder_hs, self.W_y_attend) + mh # (n_batch, n_time_steps, n_features) M = nonlinearities.tanh(M) # alpha is (n_batch, n_time_steps, 1) alpha = T.dot(M, self.w_attend) # now is (n_batch, n_time_steps) alpha = T.flatten(alpha, 2) # 0 after softmax is not 0, f**k, my mistake. # when i > encoder_seq_len, fill alpha_i to -np.inf # alpha = T.switch(encoder_mask, alpha, -np.inf) alpha = T.nnet.softmax(alpha) # apply encoder_mask to alpha # encoder_mask is (n_batch, n_time_steps) # when i > encoder_seq_len, alpha_i should be 0. # actually not need mask, but in case of error # alpha = alpha * encoder_mask alpha = alpha.dimshuffle(0, 1, 'x') weighted_encoder = T.sum(encoder_hs * alpha, axis=1) r = weighted_encoder + nonlinearities.tanh( T.dot(previous_r, self.W_t_attend)) return [cell, hid, r] def step_masked(input_n, mask_n, cell_previous, hid_previous, previous_r, *args): cell, hid, r = step(input_n, cell_previous, hid_previous, previous_r, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) r = T.switch(mask_n, r, previous_r) return [cell, hid, r] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] r_init = T.dot(ones, self.r_init) if self.attention and self.word_by_word: non_seqs += [ self.W_y_attend, self.W_h_attend, self.W_r_attend, self.w_attend, self.W_t_attend, encoder_hs, # encoder_mask ] # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out, r_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init, r_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # (n_batch, n_features) hid_N = hid_out[-1] out = hid_N if self.attention: if self.word_by_word: r_N = r_out[-1] else: mh = T.dot(hid_N, self.W_h_attend) mh = mh.dimshuffle(0, 'x', 1) M = T.dot(encoder_hs, self.W_y_attend) + mh # (n_batch, n_time_steps, n_features) M = nonlinearities.tanh(M) alpha = T.dot(M, self.w_attend) # (n_batch, n_time_steps) alpha = T.flatten(alpha, 2) # when i > encoder_seq_len, fill alpha_i to -np.inf # alpha = T.switch(encoder_mask, alpha, -np.inf) alpha = T.nnet.softmax(alpha) # apply encoder_mask to alpha # encoder_mask is (n_batch, n_time_steps) # when i > encoder_seq_len, alpha_i should be 0. # actually not need mask, but in case of error # alpha = alpha * encoder_mask alpha = alpha.dimshuffle(0, 1, 'x') # (n_batch, n_features) r_N = T.sum(encoder_hs * alpha, axis=1) out = nonlinearities.tanh( T.dot(r_N, self.W_p_attend) + T.dot(hid_N, self.W_x_attend)) return out
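The commented-out lines above hint at masking the attention scores before the softmax. A common way to do that, sketched here with illustrative names, is to fill padded positions with a large negative constant rather than -np.inf (which can produce NaNs in the gradient):

import numpy as np
import theano
import theano.tensor as T

scores = T.matrix("scores")                       # (n_batch, n_time_steps)
encoder_mask = T.matrix("encoder_mask")           # 1 for real steps, 0 for padding
masked = T.switch(T.eq(encoder_mask, 0), -1e9, scores)
alpha = T.nnet.softmax(masked)
f = theano.function([scores, encoder_mask], alpha)

s = np.random.randn(2, 4).astype(theano.config.floatX)
m = np.array([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=theano.config.floatX)
print(f(s, m))                                    # padded columns get ~0 weight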
all_params = nn.layers.get_all_params(l_out) if config.one_hot: all_params = all_params[1:] all_layers = nn.layers.get_all_layers(l_out) num_params = nn.layers.count_params(l_out) print(' number of parameters: %d' % num_params) print(' layer output shapes:') print('#params:') print('output shape:') for layer in all_layers: name = layer.__class__.__name__ num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()]) num_param = num_param.__str__() print(' %s %s %s' % (name, num_param, layer.output_shape)) y = T.cast(T.flatten(x[:, 1:]), 'int32') # training loss p1 = T.reshape(T.log(predictions[T.arange(y.shape[0]), y]), mask.shape) loss = -1. * T.mean(T.sum(mask * p1, axis=1), axis=0) # validation loss (with disabled dropout) p1_det = T.reshape(T.log(predictions_det[T.arange(y.shape[0]), y]), mask.shape) loss_det = -1. * T.mean(T.sum(mask * p1_det, axis=1), axis=0) learning_rate = theano.shared(np.float32(config.learning_rate)) grads = theano.grad(loss, all_params) updates = nn.updates.rmsprop(grads, all_params, config.learning_rate) train = theano.function([x, mask], loss, updates=updates) validate = theano.function([x, mask], loss_det)
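The masked language-model loss above can be checked in isolation. A compact sketch with illustrative shapes: predictions are (n_batch * n_steps, n_vocab) softmax outputs, y holds the flattened target indices, and mask is (n_batch, n_steps).

import numpy as np
import theano
import theano.tensor as T

predictions = T.matrix("predictions")     # (n_batch * n_steps, n_vocab) softmax outputs
y = T.ivector("y")                        # flattened target indices
mask = T.matrix("mask")                   # (n_batch, n_steps)
p1 = T.reshape(T.log(predictions[T.arange(y.shape[0]), y]), mask.shape)
loss = -1. * T.mean(T.sum(mask * p1, axis=1), axis=0)
f = theano.function([predictions, y, mask], loss)

n_batch, n_steps, n_vocab = 2, 3, 5
preds = np.full((n_batch * n_steps, n_vocab), 0.2, dtype=theano.config.floatX)
targets = np.zeros(n_batch * n_steps, dtype='int32')
m = np.ones((n_batch, n_steps), dtype=theano.config.floatX)
print(f(preds, targets, m))               # -3 * log(0.2), roughly 4.83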
def main(data_sets, W_embed): # Optimization learning rate LEARNING_RATE = theano.shared(np.array(0.001, dtype=theano.config.floatX)) eta_decay = np.array(0.5, dtype=theano.config.floatX) # Min/max sequence length MAX_LENGTH = 300 X_raw_data, Y_raw_data = data_sets.get_data_from_type("train") trainingAdmiSeqs, trainingMask, trainingLabels, trainingLengths, ltr = prepare_data( X_raw_data, Y_raw_data, vocabsize=619, maxlen=MAX_LENGTH) Num_Samples, MAX_LENGTH, N_VOCAB = trainingAdmiSeqs.shape X_valid_data, Y_valid_data = data_sets.get_data_from_type("valid") validAdmiSeqs, validMask, validLabels, validLengths, lval = prepare_data( X_valid_data, Y_valid_data, vocabsize=619, maxlen=MAX_LENGTH) X_test_data, Y_test_data = data_sets.get_data_from_type("test") test_admiSeqs, test_mask, test_labels, testLengths, ltes = prepare_data( X_test_data, Y_test_data, vocabsize=619, maxlen=MAX_LENGTH) alllength = sum(trainingLengths) + sum(validLengths) + sum(testLengths) print(alllength) eventNum = sum(ltr) + sum(lval) + sum(ltes) print(eventNum) print("Building network ...") N_BATCH = 1 # First, we build the network, starting with an input layer # Recurrent layers expect input of shape # (batch size, max sequence length, number of features) l_in = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, N_VOCAB)) #l_label = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, 1)) # The network also needs a way to provide a mask for each sequence. We'll # use a separate input layer for that. Since the mask only determines # which indices are part of the sequence for each batch entry, they are # supplied as matrices of dimensionality (N_BATCH, MAX_LENGTH) l_mask = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH)) embedsize = 100 n_topics = 50 #l_embed = lasagne.layers.DenseLayer(l_in, num_units=embedsize, b=None, W = W_embed, num_leading_axes=2) l_embed = lasagne.layers.DenseLayer(l_in, num_units=embedsize, b=None, num_leading_axes=2) #l_embed.params[l_embed.W].remove("trainable") #l_drop = lasagne.layers.dropout(l_embed) l_forward0 = lasagne.layers.GRULayer(l_embed, N_HIDDEN, mask_input=l_mask, grad_clipping=GRAD_CLIP, only_return_final=False) l_forward = MaskingLayer([l_forward0, l_mask]) l_1 = lasagne.layers.DenseLayer( l_in, num_units=N_HIDDEN, nonlinearity=lasagne.nonlinearities.rectify, num_leading_axes=2) l_2 = lasagne.layers.DenseLayer( l_1, num_units=N_HIDDEN, nonlinearity=lasagne.nonlinearities.rectify, num_leading_axes=2) mu = lasagne.layers.DenseLayer(l_2, num_units=n_topics, nonlinearity=None, num_leading_axes=1) # batchsize * n_topic log_sigma = lasagne.layers.DenseLayer( l_2, num_units=n_topics, nonlinearity=None, num_leading_axes=1) # batchsize * n_topic l_theta = ThetaLayer([mu, log_sigma], maxlen=MAX_LENGTH) #batchsize * maxlen * n_topic l_B = lasagne.layers.DenseLayer(l_in, b=None, num_units=n_topics, nonlinearity=None, num_leading_axes=2) l_context = lasagne.layers.ElemwiseMergeLayer([l_B, l_theta], T.mul) l_context = lasagne.layers.ExpressionLayer(l_context, lambda X: X.mean(-1), output_shape="auto") l_dense0 = lasagne.layers.DenseLayer(l_forward, num_units=1, nonlinearity=None, num_leading_axes=2) l_dense1 = lasagne.layers.reshape(l_dense0, ([0], [1])) #batchsize * maxlen l_dense = lasagne.layers.ElemwiseMergeLayer([l_dense1, l_context], T.add) l_out0 = lasagne.layers.NonlinearityLayer( l_dense, nonlinearity=lasagne.nonlinearities.sigmoid) l_out = lasagne.layers.ExpressionLayer( lasagne.layers.ElemwiseMergeLayer([l_out0, l_mask], T.mul), lambda X: X + 0.000001) target_values = 
T.matrix('target_output') target_values_flat = T.flatten(target_values) # lasagne.layers.get_output produces a variable for the output of the net network_output = lasagne.layers.get_output(l_out) # The network output will have shape (n_batch, maxlen); let's flatten to get a # 1-dimensional vector of predicted values predicted_values = network_output.flatten() # Our cost will be mean-squared error cost = lasagne.objectives.binary_crossentropy(predicted_values, target_values_flat) kl_term = l_theta.klterm cost = cost.sum() + kl_term test_output = lasagne.layers.get_output(l_out, deterministic=True) #cost = T.mean((predicted_values - target_values)**2) # Retrieve all parameters from the network all_params = lasagne.layers.get_all_params(l_out) # Compute SGD updates for training print("Computing updates ...") updates = lasagne.updates.adam(cost, all_params, LEARNING_RATE) # Theano functions for training and computing cost print("Compiling functions ...") train = theano.function([l_in.input_var, target_values, l_mask.input_var], cost, updates=updates) compute_cost = theano.function( [l_in.input_var, target_values, l_mask.input_var], cost) prd = theano.function([l_in.input_var, l_mask.input_var], test_output) #rnn_out = T.concatenate(l_theta.theta, lasagne.layers.get_output(l_forward0)[:,-1,:].reshape((N_BATCH, N_HIDDEN)),axis=1) output_theta = theano.function([l_in.input_var, l_mask.input_var], [ l_theta.theta, lasagne.layers.get_output(l_forward0)[:, -1, :].reshape( (N_BATCH, N_HIDDEN)) ], on_unused_input='ignore') print("Training ...") try: for epoch in range(num_epochs): train_err = 0 train_batches = 0 start_time = time.time() thetas_train = [] for batch in iterate_minibatches_listinputs( [trainingAdmiSeqs, trainingLabels, trainingMask], N_BATCH, shuffle=True): inputs = batch train_err += train(inputs[0], inputs[1], inputs[2]) train_batches += 1 theta_train, rnnvec_train = output_theta(inputs[0], inputs[2]) rnnout_train = np.concatenate([theta_train, rnnvec_train], axis=1) thetas_train.append(rnnout_train.flatten()) if (train_batches + 1) % 1000 == 0: print(train_batches) np.save("theta_with_rnnvec/thetas_train" + str(epoch), thetas_train) # # And a full pass over the validation data: # val_err = 0 # val_acc = 0 # val_batches = 0 # new_validlabels = [] # pred_validlabels = [] # for batch in iterate_minibatches_listinputs([validAdmiSeqs, validLabels, validMask, validLengths], 1, shuffle=False): # inputs = batch # err = compute_cost(inputs[0], inputs[1], inputs[2]) # val_err += err # leng = inputs[3][0] # new_validlabels.extend(inputs[1].flatten()[:leng]) # pred_validlabels.extend(prd(inputs[0], inputs[2]).flatten()[:leng]) # val_batches += 1 # val_auc = roc_auc_score(new_validlabels, pred_validlabels) # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format( epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) # print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) # print(" validation auc:\t\t{:.6f}".format(val_auc)) # print(" validation accuracy:\t\t{:.2f} %".format( # val_acc / val_batches * 100)) # After training, we compute and print the test error: test_err = 0 test_batches = 0 new_testlabels = [] pred_testlabels = [] thetas = [] for batch in iterate_minibatches_listinputs( [test_admiSeqs, test_labels, test_mask, testLengths], 1, shuffle=False): inputs = batch err = compute_cost(inputs[0], inputs[1], inputs[2]) test_err += err leng = inputs[3][0] 
new_testlabels.extend(inputs[1].flatten()[:leng]) pred_testlabels.extend( prd(inputs[0], inputs[2]).flatten()[:leng]) theta, rnnvec = output_theta(inputs[0], inputs[2]) rnnout = np.concatenate([theta, rnnvec], axis=1) thetas.append(rnnout.flatten()) test_batches += 1 test_auc = roc_auc_score(new_testlabels, pred_testlabels) test_pr_auc = pr_auc(new_testlabels, pred_testlabels) # np.save("CONTENT_results/testlabels_"+str(epoch),new_testlabels) # np.save("CONTENT_results/predlabels_"+str(epoch),pred_testlabels) # np.save("CONTENT_results/thetas"+str(epoch),thetas) # np.save("theta_with_rnnvec/testlabels_"+str(epoch),new_testlabels) # np.save("theta_with_rnnvec/predlabels_"+str(epoch),pred_testlabels) # np.save("theta_with_rnnvec/thetas"+str(epoch),thetas) test_pre_rec_f1 = precision_recall_fscore_support( np.array(new_testlabels), np.array(pred_testlabels) > 0.5, average='binary') test_acc = accuracy_score(np.array(new_testlabels), np.array(pred_testlabels) > 0.5) print("Final results:") print(" test loss:\t\t{:.6f}".format(test_err / test_batches)) print(" test auc:\t\t{:.6f}".format(test_auc)) print(" test pr_auc:\t\t{:.6f}".format(test_pr_auc)) print(" test accuracy:\t\t{:.2f} %".format(test_acc * 100)) print( " test Precision, Recall and F1:\t\t{:.4f} %\t\t{:.4f}\t\t{:.4f}" .format(test_pre_rec_f1[0], test_pre_rec_f1[1], test_pre_rec_f1[2])) except KeyboardInterrupt: pass
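pr_auc is a project helper whose definition is not shown. One plausible stand-in, and only an assumption about what it computes, is sklearn's average_precision_score:

import numpy as np
from sklearn.metrics import average_precision_score

def pr_auc(y_true, y_score):
    # area under the precision-recall curve (average precision)
    return average_precision_score(np.asarray(y_true), np.asarray(y_score))

print(pr_auc([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]))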
def sparse_categorical_crossentropy(output, target, from_logits=False):
    target = T.cast(T.flatten(target), 'int32')
    target = T.extra_ops.to_one_hot(target, nb_class=output.shape[-1])
    target = reshape(target, shape(output))
    return categorical_crossentropy(output, target, from_logits)
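A usage sketch of the same idea outside the Keras backend: sparse integer targets are densified with T.extra_ops.to_one_hot and fed to the regular categorical crossentropy.

import numpy as np
import theano
import theano.tensor as T

output = T.matrix("output")               # (n_samples, n_classes) softmax outputs
target = T.ivector("target")              # sparse class indices
onehot = T.extra_ops.to_one_hot(target, nb_class=output.shape[-1])
xent = T.nnet.categorical_crossentropy(output, onehot)
f = theano.function([output, target], xent)

probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]], dtype=theano.config.floatX)
idx = np.array([0, 1], dtype='int32')
print(f(probs, idx))                      # [-log(0.7), -log(0.8)]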
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(T.flatten(nn.layers.get_output(model.l_target)), 'int32')
    cc = nn.objectives.categorical_crossentropy(predictions, targets)
    return T.mean(cc)
def build_objective(model, deterministic=False, epsilon=1e-12):
    p = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.flatten(nn.layers.get_output(model.l_target))
    p = T.clip(p, epsilon, 1. - epsilon)
    bce = T.nnet.binary_crossentropy(p, targets)
    return T.mean(bce)
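The second objective clips the predictions before the binary crossentropy; without the clip, a prediction of exactly 0 or 1 sends the log term to -inf. A small illustration:

import numpy as np
import theano
import theano.tensor as T

p = T.vector("p")
t = T.vector("t")
epsilon = 1e-12
bce = T.nnet.binary_crossentropy(T.clip(p, epsilon, 1. - epsilon), t)
f = theano.function([p, t], bce)

preds = np.array([0., 1.], dtype=theano.config.floatX)
targets = np.array([1., 1.], dtype=theano.config.floatX)
print(f(preds, targets))                  # large but finite; without the clip the first entry is inf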
def max_pool(images, imgshp, maxpoolshp):
    """
    Implements a max pooling layer.

    Takes as input a 2D tensor of shape batch_size x img_size and performs
    max pooling. Max pooling downsamples by taking the max value in a given
    area, here defined by maxpoolshp. Outputs a 2D tensor of shape
    batch_size x output_size.

    Parameters
    ----------
    images : 2D tensor
        Tensor containing images on which to apply convolution. Assumed to be
        of shape `batch_size x img_size`.
    imgshp : tuple
        Tuple containing image dimensions.
    maxpoolshp : tuple
        Tuple containing shape of area to max pool over.

    Returns
    -------
    out1 : WRITEME
        Symbolic result (2D tensor)
    out2 : WRITEME
        Logical shape of the output
    """
    N = numpy
    poolsize = N.int64(N.prod(maxpoolshp))

    # imgshp contains either 2 entries (height, width) or 3 (nfeatures, h, w);
    # in the first case, default nfeatures to 1
    if N.size(imgshp) == 2:
        imgshp = (1, ) + imgshp

    # construct indices and index pointers for sparse matrix, which, when
    # multiplied with input images will generate a stack of image patches
    indices, indptr, spmat_shape, sptype, outshp = \
        convolution_indices.conv_eval(imgshp, maxpoolshp, maxpoolshp,
                                      mode='valid')

    print 'XXXXXXXXXXXXXXXX MAX POOLING LAYER XXXXXXXXXXXXXXXXXXXX'
    print 'imgshp = ', imgshp
    print 'maxpoolshp = ', maxpoolshp
    print 'outshp = ', outshp

    # build sparse matrix, then generate stack of image patches
    csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices, indptr,
                                    spmat_shape)
    patches = sparse.structured_dot(csc, images.T).T

    pshape = tensor.stack(images.shape[0] *
                          tensor.as_tensor(N.prod(outshp)),
                          tensor.as_tensor(imgshp[0]),
                          tensor.as_tensor(poolsize))
    patch_stack = tensor.reshape(patches, pshape, ndim=3)
    out1 = tensor.max(patch_stack, axis=2)

    pshape = tensor.stack(images.shape[0],
                          tensor.as_tensor(N.prod(outshp)),
                          tensor.as_tensor(imgshp[0]))
    out2 = tensor.reshape(out1, pshape, ndim=3)
    out3 = tensor.DimShuffle((False, ) * 3, (0, 2, 1))(out2)
    return tensor.flatten(out3, 2), outshp
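For comparison, and only as an illustrative alternative to the sparse-patch construction above, non-overlapping max pooling can also be written with plain reshapes: put each pool window on its own axes, take the max, and flatten back to batch_size x output_size.

import numpy as np
import theano
import theano.tensor as T

images = T.matrix("images")                       # batch_size x img_size
imgshp = (1, 4, 4)                                # (nfeatures, h, w), assumed
poolshp = (2, 2)

x = images.reshape((images.shape[0], imgshp[0],
                    imgshp[1] // poolshp[0], poolshp[0],
                    imgshp[2] // poolshp[1], poolshp[1]))
pooled = x.max(axis=5).max(axis=3)                # batch x nfeatures x h/2 x w/2
out = T.flatten(pooled, 2)                        # batch_size x output_size
f = theano.function([images], out)

batch = np.arange(16, dtype=theano.config.floatX).reshape(1, 16)
print(f(batch))                                   # [[  5.   7.  13.  15.]]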