def _calc_regularization_cost(self): """Calculate the regularization cost given the weight decay parameters. Only the parameters will be considered that are stored in the set self.regularize. Returns ------- theano variable regularization cost depending on the parameters to be regularized and the weight decay parameters for L1 and L2 regularization. """ cost = theano.shared(value=np.cast[floatX](.0)) l1_cost = 0 l2_cost = 0 for p in self.regularize: l1_cost += T.sum(T.abs_(self.__dict__[p])) l2_cost += T.sum(T.sqr(self.__dict__[p])) l1_cost = debug_print(l1_cost, 'l1_cost') l2_cost = debug_print(l2_cost, 'l2_cost') if self.l1_weight != 0: cost += self.l1_weight * l1_cost if self.l2_weight != 0: cost += self.l2_weight * l2_cost return cost
def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed): """Create Theano tensor of approximate ELBO by Monte Carlo sampling. """ l = (uw.size / 2).astype('int64') u = uw[:l] w = uw[l:] # Callable tensor logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False) # Naive Monte-Carlo r = MRG_RandomStreams(seed=random_seed) if n_mcsamples == 1: n = r.normal(size=inarray.tag.test_value.shape) q = n * exp(w) + u elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi)) else: n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0])) qs = n * exp(w) + u logps, _ = theano.scan(fn=lambda q: logp_(q), outputs_info=None, sequences=[qs]) elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi)) return elbo
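# Hedged illustration: a NumPy sketch of the single-sample estimate built by _elbo_t.
# With a mean-field Gaussian q (mean u, log-std w), a reparameterized draw is
# q = u + exp(w) * n with n ~ N(0, I), and the estimate is E_q[log p] plus the Gaussian
# entropy sum(w) + 0.5*l*(1 + log(2*pi)). `logp` below is a hypothetical log-joint;
# the real code clones a Theano graph instead.
import numpy as np

def elbo_one_sample(logp, u, w, rng):
    l = u.shape[0]
    n = rng.standard_normal(l)
    q = u + np.exp(w) * n                           # reparameterized sample
    entropy = np.sum(w) + 0.5 * l * (1 + np.log(2 * np.pi))
    return logp(q) + entropy

rng = np.random.default_rng(0)
logp = lambda z: -0.5 * np.sum(z ** 2)              # toy standard-normal log density
print(elbo_one_sample(logp, u=np.zeros(3), w=np.zeros(3), rng=rng))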
def apply_rnnlm(self, sentence, sentence_mask, sentence_morph, sentence_morph_mask, sentence_char, sentence_char_mask, use_noise=1): src, src_mask = sentence[:-1], sentence_mask[:-1] tgt, tgt_mask = sentence[1:], sentence_mask[1:] src_morph, src_morph_mask = sentence_morph[:-1], sentence_morph_mask[:-1] src_char, src_char_mask = sentence_char[:-1], sentence_char_mask[:-1] emb_lstm_range = T.arange(self.n_emb_lstm) #word lookup table table = LookupTable(self.n_emb_lstm, self.vocab_size, name='Wemb') src_emb = table.apply(src, emb_lstm_range) self.layers.append(table) if self.dropout < 1.0: src_emb = DropoutLayer(src_emb, use_noise, self.dropout) rnn_layer_1st = NormalRNN(self.n_emb_lstm, self.n_hids) hiddens = rnn_layer_1st.apply(src_emb, src_mask) self.layers.append(rnn_layer_1st) logistic_layer = LogisticRegression(hiddens, self.n_hids, self.vocab_size) self.layers.append(logistic_layer) self.cost = logistic_layer.cost(tgt, tgt_mask) for layer in self.layers: self.params.extend(layer.params) self.L2 = sum(T.sum(item ** 2) for item in self.params) self.L1 = sum(T.sum(abs(item)) for item in self.params)
def _calc_regularization_cost(self): """Calculate the regularization cost given the weight decay parameters. Only the parameters will be considered that are stored in the set self.regularize. We need to handle it manually in this class, because the weight matrices contain bias columns, which should not be considered in regularization computation. Therefore, do not!!! add W1 and W2 to self.regularize Returns ------- theano variable regularization cost depending on the parameters to be regularized and the weight decay parameters for L1 and L2 regularization. """ cost = super(SLmNce, self)._calc_regularization_cost() l1_cost = T.sum(T.abs_(self.W1[:, :-1])) l1_cost += T.sum(T.abs_(self.W2[:, :-1])) l2_cost = T.sum(T.sqr(self.W1[:, :-1])) l2_cost += T.sum(T.sqr(self.W2[:, :-1])) if self.l1_weight != 0: cost += self.l1_weight * l1_cost if self.l2_weight != 0: cost += self.l2_weight * l2_cost return cost
def _compute_local_cn_acts(self, input, W): # Without Scan (Faster than scan, but still way too slow) shuffledIn = input.dimshuffle(0,1,'x') shuffledMasks = self.localmask.dimshuffle('x',0,1) # cubeIn = T.repeat(shuffledIn,self.localmask.shape[1],2) # cubeMasks = T.repeat(shuffledMasks,input.shape[0],0) maskedIn = shuffledIn * shuffledMasks maskedInMean = T.sum(maskedIn,axis=1,keepdims=True) / T.sum(shuffledMasks,axis=1,keepdims=True) maskedInVar = T.sum(T.sqr((maskedIn-maskedInMean)*shuffledMasks),axis=1,keepdims=True)/T.sum(shuffledMasks,axis=1,keepdims=True) maskedInSTD = T.sqrt(maskedInVar) maskedInSubMean = maskedIn - maskedInMean maskedCN = maskedInSubMean / maskedInSTD # maskedCN = maskedInSubMean shuffledInCN = maskedCN.dimshuffle(2,0,1) allOuts = T.dot(shuffledInCN, W) diagMask = T.eye(self.localmask.shape[1],self.localmask.shape[1]).dimshuffle(0,'x',1) diagMaskAll = allOuts * diagMask activation = T.sum(diagMaskAll,axis=0) return activation
def __init__(self, kernel, max_iter = 10, max_diff = None): """ :param kernel: a function with a signature (expected, observed) -> a similarity measure that accepts symbolic theano expressions and returns them accordingly. See `crayimage.hotornot.em.kernels` for examples. :param max_iter: maximal number of iterations :param max_diff: stop iterations if the maximal difference in weights from the previous iteration is smaller than `max_diff`. If None, the check is not performed. """ self.original_shape = None self.kernel = kernel self.max_iter = max_iter self.max_diff = max_diff self.X = theano.shared( np.zeros(shape=(0, 0), dtype='float32') ) self.weights = theano.shared( np.ones(shape=(0, ), dtype='float32') ) canonical = T.sum(self.weights[:, None] * self.X, axis=0) / T.sum(self.weights) weights_updates = self.kernel(canonical, self.X) weights_diff = T.max(abs(weights_updates - self.weights)) upd = { self.weights : weights_updates } self.iteration = theano.function([], weights_diff if max_diff is not None else [], updates=upd) self.get_canonical = theano.function([], canonical)
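# Hedged illustration: a NumPy sketch of the fixed-point iteration this class compiles.
# Each step forms a weighted "canonical" sample and re-scores every row with the kernel;
# `gaussian_kernel` below is a hypothetical similarity with the (expected, observed) signature.
import numpy as np

def em_weighting(X, kernel, max_iter=10, max_diff=None):
    weights = np.ones(X.shape[0], dtype='float32')
    for _ in range(max_iter):
        canonical = (weights[:, None] * X).sum(axis=0) / weights.sum()
        new_weights = kernel(canonical, X)
        diff = np.max(np.abs(new_weights - weights))
        weights = new_weights
        if max_diff is not None and diff < max_diff:
            break
    return canonical, weights

X = np.random.rand(5, 4).astype('float32')
gaussian_kernel = lambda expected, observed: np.exp(-np.sum((observed - expected) ** 2, axis=1))
print(em_weighting(X, gaussian_kernel, max_iter=5))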
def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total): sigma2 = tt.square(sigma) Kuu = cov_total(Xu) Kuf = cov_total(Xu, X) Luu = cholesky(stabilize(Kuu)) A = solve_lower(Luu, Kuf) Qffd = tt.sum(A * A, 0) if self.approx == "FITC": Kffd = cov_total(X, diag=True) Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 else: # VFE or DTC Lamd = tt.ones_like(Qffd) * sigma2 A_l = A / Lamd L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A))) r = y - mean_total(X) r_l = r / Lamd c = solve_lower(L_B, tt.dot(A, r_l)) Kus = self.cov_func(Xu, Xnew) As = solve_lower(Luu, Kus) mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As), solve_upper(tt.transpose(L_B), c)) C = solve_lower(L_B, As) if diag: Kss = self.cov_func(Xnew, diag=True) var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0) if pred_noise: var += sigma2 return mu, var else: cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) + tt.dot(tt.transpose(C), C)) if pred_noise: cov += sigma2 * tt.identity_like(cov) return mu, stabilize(cov)
def _construct_mom_stuff(self): """ Construct the cost function for the moment-matching "regularizer". """ a = self.mom_mix_rate dist_mean = self.GN.dist_mean dist_cov = self.GN.dist_cov # Get the generated sample observations for this batch, transformed # linearly into the desired space for moment matching... X_b = T.dot(self.GN.output, self.mom_match_proj) # Get their mean batch_mean = T.mean(X_b, axis=0) # Get the updated generator distribution mean new_mean = ((1.0 - a[0]) * self.GN.dist_mean) + (a[0] * batch_mean) # Use the mean to get the updated generator distribution covariance X_b_minus_mean = X_b - new_mean # Whelp, I guess this line needs the cast... for some reason... batch_cov = T.dot(X_b_minus_mean.T, X_b_minus_mean) / T.cast(X_b.shape[0], 'floatX') new_cov = ((1.0 - a[0]) * self.GN.dist_cov) + (a[0] * batch_cov) # Get the cost for deviation from the target distribution's moments mean_err = new_mean - self.target_mean cov_err = (new_cov - self.target_cov) mm_cost = self.mom_match_weight[0] * \ (T.sum(mean_err**2.0) + T.sum(cov_err**2.0)) # Construct the updates for the running estimates of the generator # distribution's first and second-order moments. mom_updates = OrderedDict() mom_updates[self.GN.dist_mean] = new_mean mom_updates[self.GN.dist_cov] = new_cov return [mm_cost, mom_updates]
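# Hedged illustration: a NumPy sketch of the running-moment update and moment-matching
# cost above. `a` plays the role of self.mom_mix_rate[0]; the target moments are hypothetical.
import numpy as np

def moment_match_cost(X_b, dist_mean, dist_cov, target_mean, target_cov, a=0.1, weight=1.0):
    batch_mean = X_b.mean(axis=0)
    new_mean = (1.0 - a) * dist_mean + a * batch_mean
    centered = X_b - new_mean
    batch_cov = centered.T.dot(centered) / X_b.shape[0]
    new_cov = (1.0 - a) * dist_cov + a * batch_cov
    cost = weight * (np.sum((new_mean - target_mean) ** 2) + np.sum((new_cov - target_cov) ** 2))
    return cost, new_mean, new_cov

rng = np.random.default_rng(0)
X_b = rng.standard_normal((8, 3))
print(moment_match_cost(X_b, np.zeros(3), np.eye(3), np.zeros(3), np.eye(3))[0])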
def orthogonal_penalty(W, D, epsilon=1e-6, axis=1): num = T.sqr(T.sum(W * D, axis=axis)) # n = (d^T w)^2 den = T.sum(T.sqr(W), axis=axis) * T.sum(T.sqr(D), axis=axis) # d = ||w||_2^2 * ||d||_2^2 cos = num / den # c = n / d value = cos - (epsilon**2) # v = c - epsilon^2 hinge = value * (value > 0) # h = [ v ]_+ return T.sum(hinge)
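# Hedged illustration: the same squared-cosine hinge in NumPy. Rows of W and D are
# compared; a row contributes nothing once cos^2(w, d) drops below epsilon^2.
import numpy as np

def orthogonal_penalty_np(W, D, epsilon=1e-6, axis=1):
    num = np.sum(W * D, axis=axis) ** 2
    den = np.sum(W ** 2, axis=axis) * np.sum(D ** 2, axis=axis)
    value = num / den - epsilon ** 2
    return np.sum(value * (value > 0))

W = np.array([[1.0, 0.0], [1.0, 1.0]])
D = np.array([[0.0, 1.0], [1.0, 0.0]])
print(orthogonal_penalty_np(W, D))   # first row is orthogonal, only the second contributes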
def smooth_softmax(x): """Softmax that shouldn't overflow, with Laplacish smoothing.""" eps = 0.0001 e_x = T.exp(x - T.max(x, axis=1, keepdims=True)) p = (e_x / T.sum(e_x, axis=1, keepdims=True)) + constFX(eps) p_sm = p / T.sum(p, axis=1, keepdims=True) return p_sm
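# Hedged illustration: the same max-shifted, additively smoothed softmax in NumPy.
import numpy as np

def smooth_softmax_np(x, eps=1e-4):
    e_x = np.exp(x - x.max(axis=1, keepdims=True))    # shift for numerical stability
    p = e_x / e_x.sum(axis=1, keepdims=True) + eps    # Laplace-style smoothing
    return p / p.sum(axis=1, keepdims=True)           # renormalize

x = np.array([[1000.0, 0.0, -1000.0]])
print(smooth_softmax_np(x))   # no overflow, and no exactly-zero probabilities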
def test_pickle_unpickle_without_reoptimization(): mode = theano.config.mode if mode in ["DEBUG_MODE", "DebugMode"]: mode = "FAST_RUN" x1 = T.fmatrix('x1') x2 = T.fmatrix('x2') x3 = theano.shared(numpy.ones((10, 10), dtype=floatX)) x4 = theano.shared(numpy.ones((10, 10), dtype=floatX)) y = T.sum(T.sum(T.sum(x1**2 + x2) + x3) + x4) updates = OrderedDict() updates[x3] = x3 + 1 updates[x4] = x4 + 1 f = theano.function([x1, x2], y, updates=updates, mode=mode) # now pickle the compiled theano fn string_pkl = pickle.dumps(f, -1) # compute f value in1 = numpy.ones((10, 10), dtype=floatX) in2 = numpy.ones((10, 10), dtype=floatX) # test unpickle without optimization default = theano.config.reoptimize_unpickled_function try: # the default is True theano.config.reoptimize_unpickled_function = False f_ = pickle.loads(string_pkl) assert f(in1, in2) == f_(in1, in2) finally: theano.config.reoptimize_unpickled_function = default
def eq_log_pstar_vgh(self, g_hat, h_hat, s1_hat, s0_hat, v): """ Computes the expectation (under the variational distribution q(g,h)=q(g)q(h)) of the log un-normalized probability, i.e. log p^*(g,h,s,v) :param g_hat: T.matrix of shape (batch_size, n_g) :param h_hat: T.matrix of shape (batch_size, n_h) :param v : T.matrix of shape (batch_size, n_v) """ from_v = self.from_v(v) from_h = self.from_h(h_hat) from_g = self.from_g(g_hat) # center variables cg_hat = g_hat - self.cg if self.flags['center_g'] else g_hat ch_hat = h_hat - self.ch if self.flags['center_h'] else h_hat # compute expectation of various s-quantities s_hat = self.s_hat(ch_hat, s1_hat, s0_hat) ss_hat = self.s_hat(ch_hat, s1_hat**2 + 1./self.alpha_prec, s0_hat**2 + 1./self.alpha_prec) lq = 0. lq += T.sum(from_v * self._mu * from_h, axis=1) lq += T.sum(from_v * s1_hat * from_h, axis=1) lq -= 0.5 * T.sum(self.alpha_prec * ss_hat, axis=1) lq -= T.sum(0.5 * self.lambd_prec * v**2, axis=1) lq += T.sum(self.alpha_prec * from_g * s_hat, axis=1) lq += T.dot(cg_hat, self.gbias) lq += T.dot(ch_hat, self.hbias) return T.mean(lq), [g_hat, h_hat, s_hat, ss_hat, s1_hat, s0_hat, v]
def dev_loss(self, dev_types, dev_lams, ss_ratio, y): su_mask = ss_ratio * T.neq(y, 0).reshape((y.shape[0], 1)) un_mask = T.eq(y, 0).reshape((y.shape[0], 1)) ss_mask = su_mask + un_mask var_fun = lambda x1, x2: T.sum(((x1 - x2) * ss_mask)**2.0) / T.sum(ss_mask) tanh_fun = lambda x1, x2: var_fun(T.tanh(x1), T.tanh(x2)) norm_fun = lambda x1, x2: var_fun( \ (x1 / T.sqrt(T.sum(x1**2.0,axis=1,keepdims=1) + 1e-6)), \ (x2 / T.sqrt(T.sum(x2**2.0,axis=1,keepdims=1) + 1e-6))) sigm_fun = lambda x1, x2: var_fun(T.nnet.sigmoid(x1), T.nnet.sigmoid(x2)) cent_fun = lambda xt, xo: T.sum(T.nnet.binary_crossentropy( \ T.nnet.sigmoid(xo), T.nnet.sigmoid(xt))) / xt.shape[0] L = 0.0 for i in xrange(self.layer_count): if (i < (self.layer_count - 1)): x1 = self.layers[i].output x2 = self.drop_nets[0][i].output else: x1 = self.layers[i].linear_output x2 = self.drop_nets[0][i].linear_output if (dev_types[i] == 1): L = L + (dev_lams[i] * norm_fun(x1, x2)) elif (dev_types[i] == 2): L = L + (dev_lams[i] * tanh_fun(x1, x2)) elif (dev_types[i] == 3): L = L + (dev_lams[i] * sigm_fun(x1, x2)) elif (dev_types[i] == 4): L = L + (dev_lams[i] * cent_fun(x1, x2)) else: L = L + (dev_lams[i] * var_fun(x1, x2)) return L
def __init__(self, word_vec_width, batch_size, num_hidden, learning_rate=0.1): self.num_hidden = num_hidden self.learning_rate = learning_rate self.word_vec_width = word_vec_width self.batch_size = batch_size self.vocab_mat = T.fmatrix('vocab') self.word_onehot = T.fmatrix('word_onehot') b = T.fvector('b') W = T.fmatrix('W') f = 1 / (1 + T.exp(-(W * (self.word_onehot.dot(self.vocab_mat) + b)))) s = T.sum(f) self.exec_fn = theano.function( [self.word_onehot, b, W, self.vocab_mat], f, allow_input_downcast=True) self.word_onehot_c = T.fmatrix('word_onehot_c') f_c = 1 / (1 + T.exp(-(W * (self.word_onehot_c.dot(self.vocab_mat)) + b))) s_c = T.sum(f_c) J = T.largest(0, 1 - s + s_c) self.grad = theano.grad(J, [b, W, self.vocab_mat]) self.grad_fn = theano.function( [self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat], self.grad, allow_input_downcast=True)
def __init__(self, vocab_size, dim, lr=0.5): W = np.asarray(np.random.rand(vocab_size, dim), dtype=theano.config.floatX) / float(dim) W1 = np.asarray((np.random.rand(vocab_size, dim)), dtype=theano.config.floatX) / float(dim) self.W = theano.shared(W, name='W', borrow=True) self.W1 = theano.shared(W1, name='W1', borrow=True) gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX) gW1 = np.asarray( np.ones((vocab_size, dim)), dtype=theano.config.floatX) self.gW = theano.shared(gW, name='gW', borrow=True) self.gW1 = theano.shared(gW1, name='gW1', borrow=True) X = T.vector() fX = T.vector() ind_W = T.ivector() ind_W1 = T.ivector() w = self.W[ind_W, :] w1 = self.W1[ind_W1, :] cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2)) grad = T.clip(T.grad(cost, [w, w1]), -5.0, 5.0) updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :], grad[0] ** 2))] updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :], grad[1] ** 2))] updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :], - (lr / T.sqrt(self.gW[ind_W, :])) * grad[0]))] updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :], - (lr / T.sqrt(self.gW1[ind_W1, :])) * grad[1]))] updates = updates1 + updates2 + updates3 + updates4 self.cost_fn = theano.function( inputs=[ind_W, ind_W1, X, fX], outputs=cost, updates=updates)
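# Hedged illustration: a NumPy sketch of the AdaGrad step that the shared-variable
# updates above implement for the touched embedding rows; W/gW mirror the shared
# matrices, `ind` the looked-up row indices, and `grad` the gradient for those rows.
import numpy as np

def adagrad_step(W, gW, ind, grad, lr=0.5):
    # Both updates are computed from the old accumulator, mirroring how Theano
    # applies simultaneous updates to shared variables.
    W[ind, :] -= (lr / np.sqrt(gW[ind, :])) * grad   # per-coordinate scaled step
    gW[ind, :] += grad ** 2                          # accumulate squared gradients
    return W, gW

W = np.random.rand(10, 4); gW = np.ones((10, 4))
adagrad_step(W, gW, ind=np.array([0, 3]), grad=np.random.rand(2, 4))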
def unet_crossentropy_loss_sampled(y_true, y_pred): print 'unet_crossentropy_loss_sampled' epsilon = 1.0e-4 y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0-epsilon)) y_true = T.flatten(y_true) # this seems to work # it is super ugly though and I am sure there is a better way to do it # but I am struggling with theano to cooperate # filter the right indices indPos = T.nonzero(y_true)[0] # no idea why this is a tuple indNeg = T.nonzero(1-y_true)[0] # shuffle n = indPos.shape[0] indPos = indPos[srng.permutation(n=n)] n = indNeg.shape[0] indNeg = indNeg[srng.permutation(n=n)] # take equal number of samples depending on which class has less n_samples = T.cast(T.min([T.sum(y_true), T.sum(1-y_true)]), dtype='int64') indPos = indPos[:n_samples] indNeg = indNeg[:n_samples] loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1-y_pred_clipped[indNeg])) average_loss = T.mean(loss_vector) print 'average_loss:', average_loss return average_loss
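# Hedged illustration: a NumPy sketch of the class-balanced sampling idea above — take
# the same number of positive and negative pixels, then average the two log-loss terms.
import numpy as np

def balanced_bce(y_true, y_pred, rng, epsilon=1e-4):
    y_pred = np.clip(y_pred.ravel(), epsilon, 1 - epsilon)
    y_true = y_true.ravel()
    pos = rng.permutation(np.nonzero(y_true)[0])
    neg = rng.permutation(np.nonzero(1 - y_true)[0])
    n = min(pos.size, neg.size)                       # equal samples from each class
    return -np.mean(np.log(y_pred[pos[:n]])) - np.mean(np.log(1 - y_pred[neg[:n]]))

rng = np.random.default_rng(0)
y_true = np.array([1, 1, 0, 0, 0, 1])
y_pred = np.array([0.9, 0.8, 0.2, 0.4, 0.1, 0.7])
print(balanced_bce(y_true, y_pred, rng))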
def KLD_X(self,m,S): N = m.shape[0] Q = m.shape[1] KL_X = T.sum(m*m)+T.sum(S-T.log(S)) - Q*N return 0.5*KL_X
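# Hedged illustration: the same closed-form KL(q || N(0, I)) in NumPy for an N x Q
# variational mean m and diagonal variance S, i.e. 0.5*(sum(m^2) + sum(S - log S) - N*Q).
import numpy as np

def kld_x(m, S):
    N, Q = m.shape
    return 0.5 * (np.sum(m * m) + np.sum(S - np.log(S)) - N * Q)

print(kld_x(np.zeros((3, 2)), np.ones((3, 2))))   # 0.0 when q is already standard normal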
def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.), W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs): super(WeightNormLayer, self).__init__(incoming, **kwargs) self.nonlinearity = nonlinearity self.init_stdv = init_stdv k = self.input_shape[1] if b is not None: self.b = self.add_param(b, (k,), name="b", regularizable=False) if g is not None: self.g = self.add_param(g, (k,), name="g", regularizable=False, trainable=train_g) if len(self.input_shape)==4: self.axes_to_sum = (0,2,3) self.dimshuffle_args = ['x',0,'x','x'] else: self.axes_to_sum = 0 self.dimshuffle_args = ['x',0] # scale weights in layer below incoming.W_param = incoming.W #incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape)) if incoming.W_param.ndim==4: if isinstance(incoming, Deconv2DLayer): W_axes_to_sum = (0,2,3) W_dimshuffle_args = ['x',0,'x','x'] else: W_axes_to_sum = (1,2,3) W_dimshuffle_args = [0,'x','x','x'] else: W_axes_to_sum = 0 W_dimshuffle_args = ['x',0] if g is not None: incoming.W = incoming.W_param * (self.g/T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args) else: incoming.W = incoming.W_param / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum,keepdims=True))
def __init__(self, numpy_rng, theano_rng=None, n_ins=100, layers_types=[ReLU, ReLU, ReLU, LogisticRegression], layers_sizes=[1024, 1024, 1024], n_outs=2, rho=0.9, eps=1.E-6, L1_reg=0., L2_reg=0., debugprint=False, mu=0.9): """ Feedforward neural network with added L1 and/or L2 regularization. """ super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins, layers_types, layers_sizes, n_outs, rho, eps, debugprint, mu) L1 = shared(0.) for param in self.params: L1 += T.sum(abs(param)) if L1_reg > 0.: self.cost = self.cost + L1_reg * L1 L2 = shared(0.) for param in self.params: L2 += T.sum(param ** 2) if L2_reg > 0.: self.cost = self.cost + L2_reg * L2
def __init__(self, n_in, n_out, n_hidden, activation='tanh', l1_reg=0.00, l2_reg=0.00): BasicRNN.__init__(self, n_in, n_out, n_hidden, activation) bh_init = np.zeros((n_hidden,), dtype=theano.config.floatX) by_init = np.zeros((n_out,), dtype=theano.config.floatX) self.bh = theano.shared(value=bh_init, name='bh') self.by = theano.shared(value=by_init, name='by') self.params = [self.U, self.W, self.V, self.bh, self.by] # for every parameter, we maintain its last update # the idea here is to use "momentum" # keep moving mostly in the same direction self.velocity_updates = {} for param in self.params: init = np.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX) self.velocity_updates[param] = theano.shared(init) self.L1_reg = float(l1_reg) self.L2_reg = float(l2_reg) # L1 norm ; one regularization option is to enforce L1 norm to # be small self.L1 = 0 self.L1 += abs(self.W).sum() self.L1 += abs(self.U).sum() # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = 0 self.L2_sqr += T.sum(self.W ** 2) self.L2_sqr += T.sum(self.U ** 2)
def get_output_for(self, input, init=False, **kwargs): if input.ndim > 2: # if the input has more than two dimensions, flatten it into a # batch of feature vectors. input = input.flatten(2) activation = T.tensordot(input, self.W, [[1], [0]]) abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2) + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1)) if init: mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0) abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x') self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))] f = T.sum(T.exp(-abs_dif),axis=2) if init: mf = T.mean(f,axis=0) f -= mf.dimshuffle('x',0) self.init_updates.append((self.b, -mf)) else: f += self.b.dimshuffle('x',0) return T.concatenate([input, f], axis=1)
def errors(self, y, print_output=False): # check if y has same dimension of y_pred if y.ndim != self.y_pred.ndim: raise TypeError('y should have the same shape as self.y_pred', ('y', y.type, 'y_pred', self.y_pred.type)) # check if y is of the correct datatype if y.dtype.startswith('int'): num_positive = T.cast(T.sum(T.eq(y,1)),'float64') num_predicted_positive = T.cast(T.sum(T.eq(self.y_pred,1)),'float64') num_correctly_predicted = T.cast(T.sum(T.eq(self.y_pred*y,1)),'float64') # precision = True positive / (True positive + False positive); use T.switch because a Python `if` cannot branch on a symbolic comparison at run time P = T.switch(T.gt(num_predicted_positive,0.0), num_correctly_predicted / num_predicted_positive, T.cast(0.0,'float64')) # recall = True positive / (True positive + False negative) R = T.switch(T.gt(num_positive,0.0), num_correctly_predicted / num_positive, T.cast(0.0,'float64')) # F1 score F1 = T.switch(T.gt(P+R,0.0), 2.0*P*R/(P+R), T.cast(0.0,'float64')) if (print_output): print(" num positive = {0}".format( num_positive ) ) print(" num predicted positive = {0}".format( num_predicted_positive ) ) print(" num correctly predicted = {0}".format( num_correctly_predicted ) ) print(" precision = {0}".format(P)) print(" recall = {0}".format(R)) print(" F1 score = {0}".format(F1)) return [T.mean(T.neq(self.y_pred, y)), P, R, F1] else: raise NotImplementedError() return
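# Hedged illustration: the same precision/recall/F1 bookkeeping in NumPy, which may be
# easier to sanity-check than the symbolic version above.
import numpy as np

def prf1(y_true, y_pred):
    tp = np.sum((y_pred == 1) & (y_true == 1))
    predicted_pos = np.sum(y_pred == 1)
    actual_pos = np.sum(y_true == 1)
    precision = tp / predicted_pos if predicted_pos > 0 else 0.0
    recall = tp / actual_pos if actual_pos > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1

print(prf1(np.array([1, 0, 1, 1]), np.array([1, 0, 0, 1])))   # (1.0, 0.666..., 0.8)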
def build_objective(model, deterministic=False, epsilon=1e-12): predictions = nn.layers.get_output(model.l_out, deterministic=deterministic) targets = nn.layers.get_output(model.l_target) enable_targets = nn.layers.get_output(model.l_enable_target) sum_of_objectives = 0 unit_ptr = 0 for obj_idx, obj_name in enumerate(order_objectives): ptype = property_type[obj_name] if ptype == 'classification': num_units = len(property_bin_borders[obj_name]) v_obj = cce(obj_idx, (unit_ptr, unit_ptr+num_units), predictions, targets, epsilon) # take the mean of the objectives where it matters (enabled targets) obj_scalar = T.sum(enable_targets[:,obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:,obj_idx])) unit_ptr = unit_ptr + num_units elif ptype == 'continuous': v_obj = sqe(obj_idx, unit_ptr, predictions, targets) obj_scalar = T.mean(v_obj) unit_ptr += 1 else: raise if deterministic: d_objectives_deterministic[obj_name] = obj_scalar else: d_objectives[obj_name] = obj_scalar sum_of_objectives += norm_weights_loss[obj_name] * obj_scalar return sum_of_objectives
def applyConstraint(self, param): if param.ndim != 4 and param.ndim != 2: warnings.warn(("Norm constraints are normally applied to matrices" +" or 4-dimensional tensors, but currently got " +"%d dimensions, please make sure this is the desired" +" parameter to apply norm constraints") % param.ndim) needFlip = False if param.ndim == 4: # a hack for conv layer filters prevShape = param.shape # conv layer filter shape is (nChannelOut, nChannelIn, r, c) param = param.flatten(2) # now it is (nout, nin), which is different from (nin, nout) # from fully connected networks, so need to flip here needFlip = True if needFlip: col_norm = T.sqrt(T.sum(T.sqr(param), axis=1, keepdims=True)) else: col_norm = T.sqrt(T.sum(T.sqr(param), axis=0, keepdims=True)) param /= (col_norm+1e-7) param *= self.norm if needFlip: param = param.reshape(prevShape) return param
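# Hedged illustration: a NumPy sketch of the norm constraint above — each column
# (or each flattened conv filter) is rescaled so its L2 norm equals the target norm.
import numpy as np

def apply_norm_constraint(param, norm, axis=0):
    col_norm = np.sqrt(np.sum(param ** 2, axis=axis, keepdims=True))
    return param / (col_norm + 1e-7) * norm

W = np.random.randn(5, 3)
print(np.linalg.norm(apply_norm_constraint(W, norm=1.0), axis=0))   # ~[1. 1. 1.]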
def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol, log_scale=True): """ Based on code from Shawn Tan. Credits to Kyle Kastner as well. This function computes the CTC log likelihood for a sequence that has been augmented with blank labels. """ y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype="int32") y_mask_len = tensor.sum(y_mask, axis=0, dtype="int32") if log_scale: log_probabs = _log_path_probabs(y, T.log(y_hat), y_mask, y_hat_mask, blank_symbol) batch_size = log_probabs.shape[1] # Add the probabilities of the final time steps to get the total # sequence likelihood. log_labels_probab = _log_add( log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1], log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2], ) else: probabilities = _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol) batch_size = probabilities.shape[1] labels_probab = ( probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1] + probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2] ) log_labels_probab = tensor.log(labels_probab) return log_labels_probab
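# Hedged illustration: the `_log_add` helper used above is not shown in this snippet;
# a standard numerically stable log(exp(a) + exp(b)) looks like this NumPy sketch.
import numpy as np

def log_add(a, b):
    m = np.maximum(a, b)
    return m + np.log(np.exp(a - m) + np.exp(b - m))

print(log_add(np.log(0.3), np.log(0.2)), np.log(0.5))   # both ~ -0.693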
def getRpRnTpTnForTrain0OrVal1(self, y, training0OrValidation1): # The returned list has (numberOfClasses)x4 integers: >numberOfRealPositives, numberOfRealNegatives, numberOfTruePredictedPositives, numberOfTruePredictedNegatives< for each class (incl background). # Order in the list is the natural order of the classes (ie class-0 RP,RN,TPP,TPN, class-1 RP,RN,TPP,TPN, class-2 RP,RN,TPP,TPN ...) # param y: y = T.itensor4('y'). Dimensions [batchSize, r, c, z] yPredToUse = self.y_pred_train if training0OrValidation1 == 0 else self.y_pred_val checkDimsOfYpredAndYEqual(y, yPredToUse, "training" if training0OrValidation1 == 0 else "validation") returnedListWithNumberOfRpRnTpTnForEachClass = [] for class_i in xrange(0, self._numberOfOutputClasses) : #Number of Real Positive, Real Negatives, True Predicted Positives and True Predicted Negatives are reported PER CLASS (first for WHOLE). tensorOneAtRealPos = T.eq(y, class_i) tensorOneAtRealNeg = T.neq(y, class_i) tensorOneAtPredictedPos = T.eq(yPredToUse, class_i) tensorOneAtPredictedNeg = T.neq(yPredToUse, class_i) tensorOneAtTruePos = T.and_(tensorOneAtRealPos,tensorOneAtPredictedPos) tensorOneAtTrueNeg = T.and_(tensorOneAtRealNeg,tensorOneAtPredictedNeg) returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtRealPos) ) returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtRealNeg) ) returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtTruePos) ) returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtTrueNeg) ) return returnedListWithNumberOfRpRnTpTnForEachClass
def ThangAttentionUnit(attention_state_prev, current_stack_top, premise_stack_tops, projected_stack_tops, attention_dim, vs, name="attention_unit", initializer=None): """ Args: attention_state_prev: The output of this unit at the previous time step. current_stack_top: The current stack top (h state only, if applicable). premise_stack_tops: The values to do attention over. projected_stack_tops: Projected vectors to use to produce an attentive weighting alpha_t. attention_dim: The dimension of the vectors over which to do attention. vs: A variable store for the learned parameters. name: An identifier for the learned parameters in this unit. initializer: Used to initialize the learned parameters. Dimension notation: B : Batch size k : Model dim L : num_transitions """ # Shape: B x L score = T.sum(projected_stack_tops * current_stack_top, axis=2).T alpha_t = T.nnet.softmax(score) # Shape B x k Y__alpha_t = T.sum(premise_stack_tops * alpha_t.T[:, :, np.newaxis], axis=0) mlstm_input = T.concatenate([Y__alpha_t, current_stack_top], axis=1) r_t = LSTMLayer(attention_state_prev, mlstm_input, 2 * attention_dim, 2 * attention_dim, vs, name="%s/lstm" % name) return r_t
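# Hedged illustration: a NumPy sketch of the attentive read above — dot-product scores
# over the premise states, a softmax to get alpha_t, then a weighted sum of the premise.
# Shapes follow the docstring notation: premise (L x B x k), projected (L x B x k),
# stack top (B x k); the explicit broadcasting below is an assumption of this sketch.
import numpy as np

def attentive_read(premise, projected, stack_top):
    score = np.sum(projected * stack_top[None, :, :], axis=2).T           # B x L
    alpha = np.exp(score - score.max(axis=1, keepdims=True))
    alpha /= alpha.sum(axis=1, keepdims=True)                             # softmax over L
    return np.sum(premise * alpha.T[:, :, None], axis=0)                  # B x k

L, B, k = 4, 2, 3
out = attentive_read(np.random.rand(L, B, k), np.random.rand(L, B, k), np.random.rand(B, k))
print(out.shape)   # (2, 3)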
def finetune_cost_updates(self, center, mu, learning_rate): """ This function computes the cost and the updates.""" # note : we sum over the size of a datapoint; if we are using # minibatches, L will be a vector, with one entry per # example in minibatch network_output = self.get_output() temp = T.pow(center - network_output, 2) L = T.sum(temp, axis=1) # Add the network reconstruction error z = self.get_network_reconst() reconst_err = T.sum(T.pow(self.x - z, 2), axis = 1) L = self.beta*L + self.lbd*reconst_err cost1 = T.mean(L) cost2 = self.lbd*T.mean(reconst_err) cost3 = cost1 - cost2 # compute the gradients of the cost of the `dA` with respect # to its parameters gparams = T.grad(cost1, self.params) # generate the list of updates updates = [] grad_values = [] param_norm = [] for param, delta, gparam in zip(self.params, self.delta, gparams): updates.append( (delta, mu*delta - learning_rate * gparam) ) updates.append( (param, param + mu*mu*delta - (1+mu)*learning_rate*gparam )) grad_values.append(gparam.norm(L=2)) param_norm.append(param.norm(L=2)) grad_ = T.stack(*grad_values) param_ = T.stack(*param_norm) return ((cost1, cost2, cost3, grad_, param_), updates)
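# Hedged illustration: a NumPy sketch of the momentum update emitted above. Theano
# applies both updates with the old `delta` on the right-hand side, which makes the
# parameter step equivalent to param + mu*new_delta - lr*grad (a Nesterov-style step).
import numpy as np

def momentum_step(param, delta, grad, lr, mu):
    new_delta = mu * delta - lr * grad
    new_param = param + mu * mu * delta - (1 + mu) * lr * grad   # uses the old delta
    return new_param, new_delta

p, d = np.zeros(2), np.zeros(2)
p, d = momentum_step(p, d, grad=np.array([1.0, -1.0]), lr=0.1, mu=0.9)
print(p, d)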
def get_cost_updates(self, contraction_level, learning_rate): """ This function computes the cost and the updates for one training step of the cA """ y = self.get_hidden_values(self.x) z = self.get_reconstructed_input(y) J = self.get_jacobian(y, self.W) # note : we sum over the size of a datapoint; if we are using # minibatches, L will be a vector, with one entry per # example in minibatch self.L_rec = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1) # Compute the jacobian and average over the number of samples/minibatch self.L_jacob = T.sum(J ** 2) // self.n_batchsize # note : L is now a vector, where each element is the # cross-entropy cost of the reconstruction of the # corresponding example of the minibatch. We need to # compute the average of all these to get the cost of # the minibatch cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob) # compute the gradients of the cost of the `cA` with respect # to its parameters gparams = T.grad(cost, self.params) # generate the list of updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - learning_rate * gparam)) return (cost, updates)
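# Hedged illustration: a NumPy sketch of the contractive-autoencoder cost above — mean
# cross-entropy reconstruction error plus contraction_level times the batch-averaged
# squared Frobenius norm of the encoder Jacobian J (written with true division here).
import numpy as np

def ca_cost(x, z, J, contraction_level, n_batchsize):
    L_rec = -np.sum(x * np.log(z) + (1 - x) * np.log(1 - z), axis=1)
    L_jacob = np.sum(J ** 2) / n_batchsize
    return np.mean(L_rec) + contraction_level * L_jacob

x = np.array([[1.0, 0.0], [0.0, 1.0]]); z = np.clip(x, 0.1, 0.9)
print(ca_cost(x, z, J=np.ones((2, 2, 2)) * 0.01, contraction_level=0.1, n_batchsize=2))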
def output_probabilistic(self, m_w_previous, v_w_previous): if (self.non_linear): m_in = self.m_w - m_w_previous v_in = self.v_w # We compute the mean and variance after the ReLU activation lam = self.lam v_1 = 1 + 2*lam*v_in v_1_inv = v_1**-1 s_1 = T.prod(v_1,axis=1)**-0.5 v_2 = 1 + 4*lam*v_in v_2_inv = v_2**-1 s_2 = T.prod(v_2,axis=1)**-0.5 v_inv = v_in**-1 exponent1 = m_in**2*(1 - v_1_inv)*v_inv exponent1 = T.sum(exponent1,axis=1) exponent2 = m_in**2*(1 - v_2_inv)*v_inv exponent2 = T.sum(exponent2,axis=1) m_a = s_1*T.exp(-0.5*exponent1) v_a = s_2*T.exp(-0.5*exponent2) - m_a**2 return (m_a, v_a) else: m_w_previous_with_bias = \ T.concatenate([ m_w_previous, T.alloc(1, 1) ], 0) v_w_previous_with_bias = \ T.concatenate([ v_w_previous, T.alloc(0, 1) ], 0) m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs) v_linear = (T.dot(self.v_w, v_w_previous_with_bias) + \ T.dot(self.m_w**2, v_w_previous_with_bias) + \ T.dot(self.v_w, m_w_previous_with_bias**2)) / self.n_inputs return (m_linear, v_linear)
def __theano_build__(self): E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c x_a = T.ivector('x_a') x_b = T.ivector('x_b') y = T.lvector('y') def forward_step(x_t, s_t_prev): # Word embedding layer x_e = E[:, x_t] # GRU layer 1 z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t_prev)) r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t_prev)) c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t)) s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev # directly return the hidden state as intermidate output return [s_t] # sentence a vector (states) a_s, updates = theano.scan(forward_step, sequences=x_a, truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # sentence b vector (states) b_s, updates = theano.scan(forward_step, sequences=x_b, truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # semantic similarity # s_sim = manhattan_distance(a_s[-1],b_s[-1]) # for classification using simple strategy sena = a_s[-1] senb = b_s[-1] combined_s = T.concatenate([sena, senb], axis=0) # softmax class o = T.nnet.softmax(V.dot(combined_s) + c)[0] # in case the o contains 0 which cause inf eps = np.asarray([1.0e-10] * self.label_dim, dtype=theano.config.floatX) o = o + eps om = o.reshape((1, o.shape[0])) prediction = T.argmax(om, axis=1) o_error = T.nnet.categorical_crossentropy(om, y) # cost cost = T.sum(o_error) # updates updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost) # monitor parameter mV = V * T.ones_like(V) mc = c * T.ones_like(c) mU = U * T.ones_like(U) mW = W * T.ones_like(W) gV = T.grad(cost, V) gc = T.grad(cost, c) gU = T.grad(cost, U) gW = T.grad(cost, W) mgV = gV * T.ones_like(gV) mgc = gc * T.ones_like(gc) mgU = gU * T.ones_like(gU) mgW = gW * T.ones_like(gW) # Assign functions self.monitor = theano.function([x_a, x_b], [sena, senb, mV, mc, mU, mW]) self.monitor_grad = theano.function([x_a, x_b, y], [mgV, mgc, mgU, mgW]) self.predict = theano.function([x_a, x_b], om) self.predict_class = theano.function([x_a, x_b], prediction) self.ce_error = theano.function([x_a, x_b, y], cost) # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc]) # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # rmsprop cache updates # find the nan self.sgd_step = theano.function( [x_a, x_b, y], [], updates=updates # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) )
def __init__(self, layer_def, inputs, inputs_shape, rs, clone_from=None): """ Create a Dirichlet layer, according to the following paper: Malmir M, Sikka K, Forster D, Fasel I, Movellan JR, Cottrell GW. Deep Active Object Recognition by Joint Label and Action Prediction. arXiv preprint arXiv:1512.05484. 2015 Dec 17. Each unit in this layer encodes a Dicihlet distribution over its input. The input is assumed to be a belief vector, i.e. \sum_i input[i] = 1, 0 <= input_i <= 1 for all i :type layer_def: Element, xml containing configu for Conv layer :type inputs: a list of [belief_in, actions, objects, previous_belief] :param inputs[0], belief_in, is a theano.matrix which contains belief vectors in its columns :param inputs[1], actions, theano.ivector, list of actions for each column of belief_in :param inputs[2], objects, theano.ivector, list of objects for each column of belief_in :param inputs[3], previous_belief, theano.matrix, used to accumulate beliefs over time :type input_shapes: list of sizes of inputs :type rs: a random number generator used to initialize weights """ assert ( len(inputs) == 4 ) #belief dim x bacth_sz, actions: 1 x batch_size, objects 1 x batch_sz, accbelief (numActs*numObjs) x batch_sz beliefs, actions, objects, accbeliefs = inputs self.inputs = inputs # beliefs, actions, objects dim = inputs_shape[0][0] assert (inputs_shape[0][1] == inputs_shape[1][1]) #batch_size assert (inputs_shape[0][1] == inputs_shape[2][1]) #batch_size assert (inputs_shape[0][1] == inputs_shape[3][1]) #batch_size assert (inputs_shape[1][0] == 1) #action is a single integer assert (inputs_shape[2][0] == 1) #object label is a single integer batch_size = inputs_shape[0][1] self.numActions = int(layer_def.find("numActions").text) self.numObjects = int(layer_def.find("numObjects").text) assert (self.numObjects * self.numActions == inputs_shape[3][0]) assert (self.numObjects == dim) #total number of dirichlet units = numActions x numObjects num_dirichlets = self.numObjects * self.numActions if clone_from == None: self.alphas = theano.shared(np.random.randint( 5, 30, [dim, num_dirichlets]).astype(theano.config.floatX) / 25., borrow=True) # dim x num_dirichlets else: self.alphas = clone_from.alphas #self.alphas = theano.shared(0.7* np.ones([dim,num_dirichlets]).astype(theano.config.floatX),borrow=True)# dim x num_dirichlets #remove 0 from the input belief normalized_beliefs = beliefs + 1.e-6 normalized_beliefs = normalized_beliefs / T.reshape( T.sum(normalized_beliefs, axis=0), [1, batch_size]) log_normed_beliefs = T.log(normalized_beliefs) # dim x batch_size self.log_normed = log_normed_beliefs #calculate Dirichlet probs for the current normalize beliefs self.term1 = T.reshape(T.gammaln(T.sum(self.alphas, axis=0)), [num_dirichlets, 1]) self.term2 = T.reshape(T.sum(T.gammaln(self.alphas), axis=0), [num_dirichlets, 1]) self.term3 = T.dot(T.transpose(self.alphas - 1.), log_normed_beliefs) # num_dirichlets x batch_size #find a mask based on the actions dirichlet_actions = np.tile( np.arange(self.numActions).reshape([-1, 1]), [self.numObjects, 1]) dirichlet_actions = np.tile(dirichlet_actions, [1, batch_size]) dirichlet_actions = theano.shared(dirichlet_actions.astype( theano.config.floatX), borrow=True) in_actions = T.tile(T.reshape(actions, [1, batch_size]), [num_dirichlets, 1]) self.eq_actions = T.eq(dirichlet_actions, in_actions) #self.current_belief = T.exp(self.term1 - self.term2 + self.eq_actions * self.term3) #this should be normalized for each column log_cur_belief = self.term1 - self.term2 + 
self.eq_actions * self.term3 #this should be normalized for each column #log_cur_belief = self.term1 - self.term2 + self.term3 #this should be normalized for each column log_cur_belief_normd = log_cur_belief - T.reshape( T.max(log_cur_belief, axis=0), [1, batch_size]) cur_blf = self.eq_actions * T.exp(log_cur_belief_normd) self.current_belief = cur_blf / T.sum(cur_blf, axis=0) acc_is_zero = T.eq(accbeliefs, 0.) accbeliefs_no_0 = acc_is_zero + (1. - acc_is_zero) * accbeliefs updated_belief = self.eq_actions * self.current_belief * accbeliefs_no_0 + ( 1. - self.eq_actions) * accbeliefs # num_dirichlet x batch_size sum_up_blf = T.reshape(T.sum(updated_belief, axis=0), [1, batch_size]) #sum_up_blf_normed = T.switch( T.eq(sum_up_blf, 0.) , np.ones([1,batch_size]).astype(theano.config.floatX),sum_up_blf) #self.updated_belief = updated_belief / sum_up_blf_normed self.updated_belief = updated_belief / sum_up_blf self.output = self.updated_belief #self.updated_belief = self.current_belief #construct the outputs # for each class, assign 1s to the components that indicate P(a,o|x) #weights_marginalize = np.zeros([self.numObjects,num_dirichlets],dtype=theano.config.floatX) #for i in range(self.numObjects): # weights_marginalize[i,i*self.numActions:(i+1)*self.numActions] = 1. #weights_margin = theano.shared( weights_marginalize , borrow=True) #self.output = T.dot( weights_margin, self.updated_belief) #calculating weight updates objects_idx = np.tile( np.arange(self.numObjects).reshape([-1, 1]), [1, self.numActions]).reshape([1, -1]) objects_idx = np.tile(objects_idx.reshape([-1, 1]), [1, batch_size]) # num_dirichlets x batch_size objects_idx = theano.shared(objects_idx.astype(theano.config.floatX), borrow=True) in_objects = T.tile(T.reshape(objects, [1, batch_size]), [num_dirichlets, 1]) # num_dirichlets x batch_size self.idx = self.eq_actions * T.eq( objects_idx, in_objects) # num_dirichlets x batch_size self.idx = self.idx.astype(theano.config.floatX) self.N = T.reshape(T.sum(self.idx, axis=1), [1, num_dirichlets]) #take care of 0 in the input to avoid nan in log term5 = T.dot(log_normed_beliefs, T.transpose(self.idx)) #dim x num_dirichlets self.update = self.N * T.reshape(T.psi(T.sum(self.alphas, axis=0)), [1, num_dirichlets]) - self.N * T.psi( self.alphas) + term5 #self.update = T.psi(self.alphas) + term5 #calculate log-prob of data ndirichlets ndirichlets dir_l_p = self.N * T.gammaln(T.sum( self.alphas, axis=0)) - self.N * T.sum( T.gammaln(self.alphas), axis=0) + T.sum( term5 * (self.alphas - 1.), axis=0) self.log_p_ao = T.mean(dir_l_p) self.params = [self.alphas] self.inputs_shape = inputs_shape #self.output_shape = [dim,batch_size] self.output_shape = [num_dirichlets, batch_size]
def __init__(self, num_actions): # remember parameters self.num_actions = num_actions # batch size is T_MAX now self.batch_size = T_MAX #BATCH_SIZE self.discount_rate = DISCOUNT_RATE self.history_length = HISTORY_LENGTH self.screen_dim = DIMS self.img_height = SCREEN_HEIGHT self.img_width = SCREEN_WIDTH self.beta = BETA self.learning_rate = LEARNING_RATE self.rms_decay = RMS_DECAY self.rms_epsilon = RMS_EPSILON # prepare tensors once and reuse them state = T.tensor4('state') reward = T.fvector('reward') advantage = T.fvector('advantage') action = T.ivector('action') #beta = T.fscalar('regularization_rate') # set learning rate #self.shared_beta = theano.shared(np.zeros((1)), dtype=theano.config.floatX , # broadcastable=(True)) #self.shared_beta.set_value([BETA]) # create shared theano variables self.state_shared = theano.shared( np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width), dtype=theano.config.floatX)) self.reward_shared = theano.shared( np.zeros((self.batch_size), dtype=theano.config.floatX)) self.advantage_shared = theano.shared( np.zeros((self.batch_size), dtype=theano.config.floatX)) self.action_shared = theano.shared( np.zeros((self.batch_size), dtype='int32')) # can add multiple nets here # Shared network parameters here self.shared_net = self.build_shared_network() shared_out = lasagne.layers.get_output(self.shared_net, state) ####### OPTIMIZATION here -------------- # Policy network parameters here self.policy_network = self.build_policy_network() policy_out = lasagne.layers.get_output(self.policy_network, shared_out) # Value network parameters here self.value_network = self.build_value_network() value_out = lasagne.layers.get_output(self.value_network, shared_out) ## ----------------------- LOSS FUNCTION SHIT STARTS HERE ---------------------------------------- N = state.shape[0] # take log policy loss policy_loss = -T.log(policy_out[ T.arange(N), self.action_shared]) * self.advantage_shared # take entropy and add with the regularizer entropy = -T.sum(-policy_out * T.log(policy_out), axis=1) # add regullazrization policy_loss += self.beta * entropy #policy_loss = T.sum(policy_loss) # get the value loss value_loss = ( (self.reward_shared - T.reshape(value_out, (self.batch_size, )))**2) / 2 #value_loss = T.sum(value_loss) total_loss = T.sum(policy_loss + (0.5 * value_loss)) ## ----------------------- LOSS FUNCTION SHIT ENDS HERE ---------------------------------------- shared_params = lasagne.layers.helper.get_all_params(self.shared_net) only_policy_params = lasagne.layers.helper.get_all_params( self.policy_network) only_value_params = lasagne.layers.helper.get_all_params( self.value_network) policy_params = shared_params + only_policy_params value_params = shared_params + only_value_params g_time = time.time() logger.info("graph compiling") # get grads here policy_grad = T.grad(total_loss, policy_params) value_grad = T.grad(total_loss, value_params) # there'll be two kind of updates policy_updates = rmsprop_updates(policy_grad, policy_params, self.learning_rate, self.rms_decay, self.rms_epsilon) value_updates = rmsprop_updates(value_grad, value_params, self.learning_rate, self.rms_decay, self.rms_epsilon) givens = { state: self.state_shared, reward: self.reward_shared, action: self.action_shared, advantage: self.advantage_shared, } # theano functions for accumulating the grads self._policy_grad = theano.function([], policy_grad, givens=givens) self._value_grad = theano.function([], value_grad, givens=givens) # train will take input the grads and 
just apply them # NEEDS work here ------------ self._train_policy = theano.function([], [], updates=policy_updates, givens=givens) self._train_value = theano.function([], [], updates=value_updates, givens=givens) # get output for a state self._policy = theano.function([], policy_out, givens={state: self.state_shared}) self._value = theano.function([], value_out, givens={state: self.state_shared}) # need more theano functions for getting policy and value logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
def get_detections(self, model, data_x, data_m, params): pr_threshold = params.get("prThreshold", 0.01) nms_threshold = params.get("nmsThreshold", 0.5) corner_threshold = params.get("cornerThreshold", self.sparse_layer.corner_threshold) corner_max = params.get("cornerMax", 1024) use_soft_nms = params.get("useSoftNMS", 0) == 1 t = (pr_threshold, nms_threshold, corner_threshold, corner_max) logging.verbose("Using detection params - pr threshold: %f, nms threshold: %f, corner_threshold: %f, corner_max: %i"%t) first_detect = False if self.detect_func is None: #get all model outputs outputs=[] if self.use_jointfit: det_fit = self.det_pr det_fit_null = det_fit[:, self.null_class, :, :] det_fit = det_fit[:,:self.class_num*self.fitness_num, :, :] det_fit = det_fit.reshape((self.batch_size, self.class_num, self.fitness_num, self.sample_num, self.sample_num)) det_fit_pr = tensor.exp(det_fit) m = tensor.max(det_fit, axis=2) det_pr = m + tensor.log(tensor.sum(tensor.exp(det_fit - m[:,:,None,:,:]), axis=2)) det_pr = tensor.concatenate([det_pr, det_fit_null[:,None,:,:]], axis=1) outputs.append(det_pr) val = [self.overlap_threshold[0] + i*(1.0 - self.overlap_threshold[0])/self.fitness_num for i in range(self.fitness_num)] fitness_val = theano.shared(numpy.array(val, dtype=numpy.float32)) fitness = tensor.log(tensor.sum(det_fit_pr*fitness_val[None,None,:,None,None], axis=2)) outputs.append(fitness) else: outputs.append(self.det_pr) if self.use_bbox_reg: outputs.append(self.bbox_predict) if self.use_indfit: outputs.append(tensor.exp(self.indfit_pr)) logging.info("Building detection function") self.detect_func = theano.function([model.input], outputs, givens=[(get_train(), tensor.cast(0, 'int8'))], on_unused_input='ignore') logging.verbose("Exporting graph...") with open("detect_graph.txt", "w") as f: theano.printing.debugprint(self.detect_func, file=f, print_type=True) first_detect = True #get sampling bounding boxs logging.verbose("Detecting sample bboxs (%.2f)"%corner_threshold) timer = common.Timer() sample_bboxs = self.sparse_layer.get_samples(data_x, train=False, store_shared=True) timer.mark() logging.verbose("Found sample bboxs: {}".format([len(bbox) for bbox in sample_bboxs])) #upload sampling bounding boxs bboxs = self.sparse_layer.set_samples(sample_bboxs) timer.mark() #classify sampling bounding boxs r = list(self.detect_func(data_x)) #get outputs if self.use_jointfit: det_pr = r[0] fitness = r[1] r_index = 2 else: det_pr = r[0] fitness = numpy.copy(det_pr) r_index = 1 if self.use_bbox_reg: bboxs = r[r_index] r_index += 1 else: bboxs = self.sparse_layer.get_bbox_array(sample_bboxs) if self.use_indfit: indfit_pr = r[r_index] fitness_val = numpy.array([0.0] + [self.overlap_threshold[0] + i * (1.0 - self.overlap_threshold[0])/(self.fitness_num-1) for i in range(self.fitness_num-1)]) fitness_exp = numpy.sum(indfit_pr*fitness_val[None,:,None,None], axis=1).astype(numpy.float32) fitness += numpy.log(fitness_exp)[:,None,:,:] r_index += 1 timer.mark() sample_bbox_num = [len(s) for s in sample_bboxs] detlists = c_code.build_detections_nms(pr_threshold, nms_threshold, use_soft_nms, det_pr, fitness, bboxs, sample_bbox_num) timer.mark() logging.verbose("Found detections:", [len(detlist) for detlist in detlists]) logging.verbose("FPS=%.1f, Timing (ms) - get samples: %i, upload: %i, classify: %i, build+nms %i"%tuple([self.batch_size / timer.current()] + timer.deltas_ms())) if not first_detect: global detect_time, detect_num detect_time += timer.current() detect_num += self.batch_size logging.info("Average 
FPS=%.1f"%(detect_num / detect_time)) #results format results=[] for i, detlist in enumerate(detlists): results.append({"detections":detlist, "meta":data_m[i]}) return results
def get_errors(self, yt_index, yt_value): #unpack indexs and values shapes = [self.det_shape] if self.use_bbox_reg: shapes += [(self.batch_size, self.sample_num, self.sample_num), (self.batch_size, 8, self.sample_num, self.sample_num)] if self.use_indfit: shapes += [self.indfit_shape] v = common.ndarray_unpack(yt_value, shapes) det_pr = v[0] index = 1 if self.use_bbox_reg: bbox_valid, bbox_reg = v[index:(index+2)] index += 2 if self.use_indfit: indfit_pr = v[index:(index+1)] #Detection Cost: det_errors = -tensor.sum(det_pr*self.det_pr, axis=1) / math.log(self.det_shape[1]) #Bounding Box Regression Cost: bbox_errors = None if self.use_bbox_reg and self.bbox_factor > 0.0: bbox_target = bbox_reg[:,0:4,...] bbox_sample = bbox_reg[:,4:8,...] bbox_errors = tensor.zeros((self.batch_size, self.sample_num, self.sample_num), dtype=numpy.float32) if self.use_bounded_iou: target_x = bbox_target[:,0,:,:] target_y = bbox_target[:,1,:,:] target_w = bbox_target[:,2,:,:] target_h = bbox_target[:,3,:,:] predict_x = 0.5*(self.bbox_predict[:,:,:,0] + self.bbox_predict[:,:,:,2]) predict_y = 0.5*(self.bbox_predict[:,:,:,1] + self.bbox_predict[:,:,:,3]) predict_w = self.bbox_predict[:,:,:,2] - self.bbox_predict[:,:,:,0] predict_h = self.bbox_predict[:,:,:,3] - self.bbox_predict[:,:,:,1] dx = target_x - predict_x dy = target_y - predict_y eps = 0.001 #ORIGINAL Paper used 4*dx, proper implementation is 2*dx cost_x = tensor.switch(dx >= 0.0, 2*dx / (target_w + dx + eps), -2*dx / (target_w - dx + eps)) cost_y = tensor.switch(dy >= 0.0, 2*dy / (target_h + dy + eps), -2*dy / (target_h - dy + eps)) cost_w = 1.0 - tensor.minimum(target_w / (predict_w + eps), predict_w / (target_w + eps)) cost_h = 1.0 - tensor.minimum(target_h / (predict_h + eps), predict_h / (target_h + eps)) cost = tensor.concatenate([cost_x[:,None,:,:], cost_y[:,None,:,:], cost_w[:,None,:,:], cost_h[:,None,:,:]], axis=1) bbox_errors += self.bbox_factor*bbox_valid*tensor.sum(theano_util.smooth_L1(cost), axis=1) else: #standard Fast R-CNN style cost tx = (bbox_target[:, 0, ...] - bbox_sample[:, 0, ...]) / bbox_sample[:, 2, ...] ty = (bbox_target[:, 1, ...] - bbox_sample[:, 1, ...]) / bbox_sample[:, 3, ...] tw = tensor.log(bbox_target[:, 2, ...] / bbox_sample[:, 2, ...]) th = tensor.log(bbox_target[:, 3, ...] / bbox_sample[:, 3, ...]) t = tensor.concatenate([tx[:,None, ...], ty[:,None, ...], tw[:,None, ...], th[:,None, ...]], axis=1) dt = t - self.bbox_reg bbox_errors += self.bbox_factor*bbox_valid*tensor.sum(theano_util.smooth_L1(dt), axis=1) indfit_errors = None if self.use_indfit: indfit_errors = -tensor.sum(indfit_pr*self.indfit_pr, axis=1) / math.log(self.fitness_num) return det_errors, bbox_errors, indfit_errors
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate([cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)], axis=2) clstms, chidden_list = make_bidir_lstm_stack(cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2*sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2*config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP start attention_mlp_start = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_start') attention_qlinear_start = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_start') #Wum attention_clinear_start = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_start') # Wym bricks += [attention_mlp_start, attention_qlinear_start, attention_clinear_start] layer1_start = Tanh(name='layer1_start') layer1_start = layer1_start.apply(attention_clinear_start.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))) .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0])) + attention_qlinear_start.apply(qenc)[None, :, :]) att_weights_start = attention_mlp_start.apply(layer1_start.reshape((layer1_start.shape[0]*layer1_start.shape[1], layer1_start.shape[2]))) att_weights_start = att_weights_start.reshape((layer1_start.shape[0], layer1_start.shape[1])) att_weights_start = tensor.nnet.softmax(att_weights_start.T).T attended = tensor.sum(cenc * att_weights_start[:, :, None], axis=0) attended.name = 'attended' # Attention mechanism MLP end attention_mlp_end = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_end') attention_qlinear_end = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_end') #Wum attention_clinear_end = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, 
name='attc_end') # Wym bricks += [attention_mlp_end, attention_qlinear_end, attention_clinear_end] layer1_end = Tanh(name='layer1_end') layer1_end = layer1_end.apply(attention_clinear_end.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))) .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0])) + attention_qlinear_end.apply(attended)[None, :, :]) att_weights_end = attention_mlp_end.apply(layer1_end.reshape((layer1_end.shape[0]*layer1_end.shape[1], layer1_end.shape[2]))) att_weights_end = att_weights_end.reshape((layer1_end.shape[0], layer1_end.shape[1])) att_weights_end = tensor.nnet.softmax(att_weights_end.T).T att_weights_start = tensor.dot(tensor.le(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)), tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_start) att_weights_end = tensor.dot(tensor.ge(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)), tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_end) # add attention from left and right #att_weights = att_weights_start * att_weights_end att_weights = tensor.minimum(att_weights_start, att_weights_end) att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)), tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1) self.predictions = tensor.gt(att_weights, 0.5) * context att_target = att_target / (att_target.sum(axis=0) + 0.00001) att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001) cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' att_weights_start.name = 'att_weights_start' att_weights_end.name = 'att_weights_end' att_target.name = 'att_target' att_weights.name = 'att_weights' self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def test_batch_normalization_train_broadcast(): for axes in ('per-activation', 'spatial', (1, 2, 3, 4)): for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector): x = vartype('x') ndim = x.ndim eps = 5e-3 # some non-standard value to test if it's used running_average_factor = 0.3 # remove non-existing axes if isinstance(axes, tuple): axes = tuple(i for i in axes if i < ndim) if len(axes) == 0: continue # convert axes to explicit list if axes == 'per-activation': axes2 = (0, ) elif axes == 'spatial': axes2 = (0, ) + tuple(range(2, ndim)) else: axes2 = axes # compute axes for parameter tensors non_bc_axes = tuple(i for i in range(ndim) if i not in axes2) params_dimshuffle = ['x'] * ndim for i, axis in enumerate(non_bc_axes): params_dimshuffle[axis] = i # construct non-broadcasted parameter variables param_type = T.TensorType(x.dtype, (False, ) * len(non_bc_axes)) scale, bias, running_mean, running_var = (param_type(n) for n in ('scale', 'bias', 'running_mean', 'running_var')) # broadcast parameter variables scale_bc = scale.dimshuffle(params_dimshuffle) bias_bc = bias.dimshuffle(params_dimshuffle) running_mean_bc = running_mean.dimshuffle(params_dimshuffle) running_var_bc = running_var.dimshuffle(params_dimshuffle) # batch_normalization_train with original, non-broadcasted variables train_non_bc = \ bn.batch_normalization_train( x, scale, bias, axes, eps, running_average_factor, running_mean, running_var) # batch_normalization_train with broadcasted variables train_bc = \ bn.batch_normalization_train( x, scale_bc, bias_bc, axes, eps, running_average_factor, running_mean_bc, running_var_bc) train_bc = tuple([train_bc[0]] + # out [r.dimshuffle(non_bc_axes) for r in train_bc[1:]]) # batch_normalization_test with original, non-broadcasted variables test_non_bc = \ bn.batch_normalization_test( x, scale, bias, running_mean, running_var, axes, eps) # batch_normalization_test with broadcasted variables test_bc = \ bn.batch_normalization_test( x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps) # subtract the results of the non-broadcasted and broadcasted calls results_non_bc = train_non_bc + (test_non_bc, ) results_bc = train_bc + (test_bc, ) results = [ abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc) ] # compile to compute all differences f = theano.function([x, scale, bias, running_mean, running_var], T.sum(sum(results))) # the paired ops are exactly the same, so the optimizer should have # collapsed the sum of differences to a constant zero nodes = f.maker.fgraph.toposort() if theano.config.mode != "FAST_COMPILE": assert len(nodes) == 1 assert isinstance(nodes[0].op, theano.compile.DeepCopyOp) inputs = [ np.asarray(np.random.rand(*((4, ) * n)), x.dtype) for n in [ x.ndim, scale.ndim, bias.ndim, running_mean.ndim, running_var.ndim ] ] assert 0.0 == f(*inputs)
def hybrid_loss(y, t):
    log_loss = categorical_crossentropy(y, t).mean()
    kappa_loss = quad_kappa_loss(y, t, y_pow=2)
    return kappa_loss + 0.5 * T.clip(log_loss, 0.6, 10**3)


def discrete_predict(predictions):
    return T.round(T.clip(predictions, 0, 4))


predictions = nn.layers.get_output(output_layer, deterministic=False)
train_log_loss, train_reg_loss, train_multi_loss = multi_task_loss(predictions, y)
params = nn.layers.get_all_params(output_layer, regularizable=True)
regularization = sum(T.sum(p ** 2) for p in params)
train_loss = train_multi_loss + l2_reg * regularization
train_accuracy = accuracy(predictions[:, :num_class], y)
train_kappa = quad_kappa(predictions[:, :num_class], y)

valid_predictions = nn.layers.get_output(output_layer, deterministic=True)
valid_log_loss, valid_reg_loss, valid_multi_loss = multi_task_loss(valid_predictions, y)
valid_accuracy = accuracy(valid_predictions[:, :num_class], y)
valid_kappa = quad_kappa(valid_predictions[:, :num_class], y)

# Scale grads
all_params = nn.layers.get_all_params(output_layer, trainable=True)
all_grads = T.grad(train_loss, all_params)
# scaled_grads = nn.updates.total_norm_constraint(all_grads, max_norm=5, return_norm=False)
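# Hedged numeric sketch (plain numpy, not from the original code) of how the
# clipped log-loss term in hybrid_loss behaves: cross-entropy values below 0.6
# are floored, so very confident batches stop contributing extra gradient
# through the log-loss part while the kappa term keeps training.
import numpy as np

def clipped_log_loss_term(log_loss, lo=0.6, hi=1e3, weight=0.5):
    # Mirrors 0.5 * T.clip(log_loss, 0.6, 10**3) from hybrid_loss above.
    return weight * np.clip(log_loss, lo, hi)

for ll in (0.1, 0.6, 2.3):
    print(ll, clipped_log_loss_term(ll))  # 0.1 and 0.6 both give 0.3; 2.3 gives 1.15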
def GESD(sum_uni_l, sum_uni_r):
    eucli = 1 / (1 + T.sum((sum_uni_l - sum_uni_r) ** 2))
    kernel = 1 / (1 + T.exp(-(T.dot(sum_uni_l, sum_uni_r.T) + 1)))
    return (eucli * kernel).reshape((1, 1))
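# Hedged usage sketch (the names a, b, gesd_fn are illustrative only): GESD
# combines a Euclidean-distance similarity with a sigmoid-of-dot-product kernel,
# so identical inputs score close to 1 and distant inputs decay toward 0.
import numpy as np
import theano
import theano.tensor as T

a = T.fmatrix('a')  # expected shape (1, emb_size), as in the calling code
b = T.fmatrix('b')
gesd_fn = theano.function([a, b], GESD(a, b), allow_input_downcast=True)
# gesd_fn(np.ones((1, 4), dtype='float32'), np.ones((1, 4), dtype='float32'))
# -> roughly 0.99: eucli = 1 and the sigmoid kernel is close to 1.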
# theano #################################################################################################################
# check forward
fo = theano.function([X, W_x, W_h, B, hid], o)
print "x=", v_x
print "w_x=", v_w_x
print "w_h=", v_w_h
print "b=", v_b
print "b_mkl=", v_b_mkl
print "hid=", v_hid
print "o_real=", v_o_real
v_o = fo(v_x, v_w_x, v_w_h, v_b, v_hid)
# print "forward o=", v_o

# check gradients
loss = -T.sum(o * T.log(o_real))
gx = T.grad(loss, X)
fx = theano.function([X, W_x, W_h, B, hid, o_real], gx)
# theano.printing.pydotprint(fx, outfile='rnn_dx.png', var_with_name_simple=True)
gradients_x = fx(v_x, v_w_x, v_w_h, v_b, v_hid, v_o_real)
print "gradients_x=", gradients_x

# mkl #################################################################################################################
O = mkl_simplernn_bw_op.SimpleRNN_bw()(X, W_x, W_h, B_mkl, hid, o_real)
fx_mkl = theano.function([X, W_x, W_h, B_mkl, hid, o_real], O)
gradients_x_mkl = fx_mkl(v_x, v_w_x, v_w_h, v_b_mkl, v_hid, v_o_real)
theano.printing.pydotprint(fx_mkl, outfile='rnn_dx.png', var_with_name_simple=True)
print "gradients_x_mkl=", gradients_x_mkl
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3, maxSentLength=30, emb_size=300, hidden_size=[300,10], margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) vocab_size=len(word2id)+1 mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2] indices_train_r=indices_train[1::2] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2] indices_test_r=indices_test[1::2] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] train_size = len(indices_train_l) test_size = len(indices_test_l) train_batch_start=range(train_size) test_batch_start=range(test_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int32') # indices_train_r=T.cast(indices_train_r, 'int32') # indices_test_l=T.cast(indices_test_l, 'int32') # indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng) # rand_values[0]=numpy.array(numpy.zeros(emb_size)) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec) embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.iscalar() x_index_l = T.imatrix() # now, x is the index matrix, must be integer x_index_r = T.imatrix() y = T.ivector() left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.fscalar() norm_length_r=T.fscalar() mts=T.fmatrix() wmf=T.fmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0])) layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0])) layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size)) norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size)) norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts HL_layer_1_input=T.concatenate([ # mts, eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # uni_cosine, # sum_uni_l, # sum_uni_r, # sum_uni_l+sum_uni_r, 1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)), cosine(layer0_l_output_maxpool, layer0_r_output_maxpool), layer0_l_output_maxpool, layer0_r_output_maxpool, T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10), layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer1.output_cosine, layer1.output_vector_l, layer1.output_vector_r, T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10), # len_l, len_r layer1.output_attentions # wmf, ], axis=1)#, layer2.output, 
layer1.output_cosine], axis=1) HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input, mts, len_l, len_r # wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) HL_layer_1_input_size=1+1+ 1+1+3* nkerns[0] +1+1+3*nkerns[0]+10*10 HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh) LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1) LR_layer_input_with_extra=T.concatenate([HL_layer_2.output, HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output, LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2) # LR_layer_input=HL_layer_2.output # LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2) # layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() # diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix) cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True) params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): clipped_grad = T.clip(grad_i, -0.5, 0.5) acc = acc_i + T.sqr(clipped_grad) updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True) train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is best_params = None best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False max_acc=0.0 nn_max_acc=0.0 best_iter=0 cost_tmp=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data for index in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * train_size + minibatch_index +1 minibatch_index=minibatch_index+1 # if iter%update_freq != 0: # cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) # #print 'cost_ij: ', cost_ij # cost_tmp+=cost_ij # error_sum+=error_ij # else: cost_i, error_i= train_model(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) cost_tmp+=cost_i if iter < 6000 and iter %100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) if iter >= 6000 and iter % 100 == 0: # if iter%100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) test_losses=[] test_y=[] test_features=[] for index in test_batch_start: test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size], indices_test_r[index: index + batch_size], testY[index: index + batch_size], testLeftPad_l[index], testRightPad_l[index], testLeftPad_r[index], testRightPad_r[index], testLengths_l[index], testLengths_r[index], normalized_test_length_l[index], normalized_test_length_r[index], mt_test[index: index + batch_size], wm_test[index: index + batch_size]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc = (1-test_score) * 100. 
if test_acc > nn_max_acc: nn_max_acc = test_acc print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc #now, see the results of svm if use_svm: train_y=[] train_features=[] for index in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n') #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if numpy.absolute(results_lr[i]-test_y[i])<0.5: corr_lr+=1 acc=corr_count*1.0/test_size acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_iter=iter if acc_lr> max_acc: max_acc=acc_lr best_iter=iter print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ', max_acc , ' at iter: ', best_iter if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def log_softmax(x):
    x_diff = x - x.max(1, keepdims=True)
    return x_diff - T.log(T.sum(T.exp(x_diff), axis=1, keepdims=True))
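# Hedged usage sketch (names are illustrative): subtracting the row-wise max
# before exponentiating keeps exp() in range, so log_softmax stays finite even
# for rows like [1000., 0.], where a probability underflowing to zero would make
# a naive T.log(softmax(x)) return -inf.
import numpy as np
import theano
import theano.tensor as T

logits = T.matrix('logits')
log_softmax_fn = theano.function([logits], log_softmax(logits))
# log_softmax_fn(np.array([[1000., 0.]], dtype=theano.config.floatX))
# -> approximately [[0., -1000.]]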
def __init__(self, **option): # source and target embedding dim sedim, tedim = option["embdim"] # source, target and attention hidden dim shdim, thdim, ahdim, domaindim, feadim = option["hidden"] # maxout hidden dim maxdim = option["maxhid"] # maxout part maxpart = option["maxpart"] # deepout hidden dim deephid = option["deephid"] svocab, tvocab = option["vocabulary"] sw2id, sid2w = svocab tw2id, tid2w = tvocab # source and target vocabulary size svsize, tvsize = len(sid2w), len(tid2w) dnum = option['dnum'] if "scope" not in option or option["scope"] is None: option["scope"] = "rnnsearch" if "initializer" not in option: option["initializer"] = None if "regularizer" not in option: option["regularizer"] = None if "keep_prob" not in option: option["keep_prob"] = 1.0 dtype = theano.config.floatX initializer = option["initializer"] regularizer = option["regularizer"] keep_prob = option["keep_prob"] or 1.0 scope = option["scope"] decoder_scope = "decoder" encoder = Encoder(sedim, shdim) decoderType = eval("Decoder{}".format(option["decoder"])) decoder = decoderType(tedim, thdim, ahdim, 2 * shdim, dnum=dnum, dim_maxout=maxdim, max_part=maxpart, dim_readout=deephid, dim_domain=domaindim, feadim=feadim, n_y_vocab=tvsize) # training graph with ops.variable_scope(scope, initializer=initializer, regularizer=regularizer, dtype=dtype): src_seq = T.imatrix("source_sequence") src_mask = T.matrix("source_sequence_mask") tgt_seq = T.imatrix("target_sequence") tgt_mask = T.matrix("target_sequence_mask") tag_seq = T.imatrix("domain_tag") # nsrc_mask = T.set_subtensor(src_mask[T.cast(T.sum(src_mask, 0) - 1, 'int32'), # T.arange(src_mask.shape[1])], 0.0) with ops.variable_scope("source_embedding"): source_embedding = ops.get_variable("embedding", [svsize, sedim]) source_bias = ops.get_variable("bias", [sedim]) with ops.variable_scope("target_embedding") as tgtembscope: target_embedding = ops.get_variable("embedding", [tvsize, tedim]) # target_bias = ops.get_variable("bias", [tedim]) decoder.tiescope = tgtembscope source_inputs = nn.embedding_lookup(source_embedding, src_seq) target_inputs = nn.embedding_lookup(target_embedding, tgt_seq) source_inputs = source_inputs + source_bias if keep_prob < 1.0: source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob) target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob) states, r_states = encoder.forward(source_inputs, src_mask) annotation = T.concatenate([states, r_states], 2) with ops.variable_scope("Specific"): domain_alpha = domain_sensitive_attention( annotation, src_mask, shdim * 2, domaindim) # domain_alpha = attention(r_states[0], annotation, nsrc_mask, # shdim, # shdim * 2) domain_context = T.sum(annotation * domain_alpha[:, :, None], 0) dfeature = nn.feedforward(domain_context, [shdim * 2, feadim], True, activation=T.tanh, scope="feature1") dscores = nn.feedforward(dfeature, [feadim, dnum], True, activation=T.tanh, scope="score") # (batch, 2) dprobs = T.nnet.softmax(dscores) dpred_tag = T.argmax(dprobs, 1) didx = T.arange(tag_seq.flatten().shape[0]) dce = -T.log(dprobs[didx, tag_seq.flatten()]) dcost = T.mean(dce) share_alpha = domain_sensitive_attention(annotation, src_mask, shdim * 2, domaindim) # share_alpha = attention(r_states[0], annotation, nsrc_mask, # shdim, # shdim * 2) share_context = T.sum(annotation * share_alpha[:, :, None], 0) sfeature = nn.feedforward(share_context, [shdim * 2, feadim], True, activation=T.tanh, scope="feature1") with ops.variable_scope("Shared"): sscores = nn.feedforward(sfeature, [feadim, dnum], True, 
activation=T.tanh, scope="score") # (batch, 2) sprobs = T.nnet.softmax(sscores) spred_tag = T.argmax(sprobs, 1) sidx = T.arange(tag_seq.flatten().shape[0]) sce = -T.log(sprobs[sidx, tag_seq.flatten()]) scost = T.mean(sce) adv_sce = -sprobs[sidx, tag_seq.flatten()] * T.log( sprobs[sidx, tag_seq.flatten()]) adv_scost = T.mean(adv_sce) domain_gate = nn.feedforward([dfeature, annotation], [[feadim, shdim * 2], shdim * 2], True, scope="domain_gate") domain_annotation = annotation * domain_gate domain_annotation = nn.dropout(domain_annotation, keep_prob=keep_prob) share_gate = nn.feedforward([sfeature, annotation], [[feadim, shdim * 2], shdim * 2], True, scope="share_gate") annotation = annotation * share_gate annotation = nn.dropout(annotation, keep_prob=keep_prob) # compute initial state for decoder # first state of backward encoder # batch * shdim final_state = T.concatenate([ annotation[0, :, annotation.shape[-1] / 2:], domain_annotation[0, :, annotation.shape[-1] / 2:] ], -1) with ops.variable_scope(decoder_scope): initial_state = nn.feedforward(final_state, [shdim * 2, thdim], True, scope="initial", activation=T.tanh) # keys for query mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic") mapped_domain_keys = map_key(domain_annotation, 2 * shdim, ahdim, "domain") _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward( tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask, annotation, initial_state, mapped_domain_keys, domain_annotation, tag_seq, keep_prob) lamb = theano.shared(numpy.asarray(option["lambda"], dtype), "lambda") # cwscost *= lamb final_cost = cost + dcost + tgtdcost - lamb * adv_scost tag_inputs = [src_seq, src_mask] tag_outputs = [dpred_tag, spred_tag] tag_predict = theano.function(tag_inputs, tag_outputs) self.tag_predict = tag_predict tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask] tgt_tag_outputs = [tpred_tag] tgt_tag_predict = theano.function(tgt_tag_inputs, tgt_tag_outputs) self.tgt_tag_predict = tgt_tag_predict training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq] training_outputs = [cost, dcost, adv_scost, tgtdcost] self.cost_cla = scost self.inputs_cla = [src_seq, src_mask, tag_seq] self.outputs_cla = [scost] # decoding graph with ops.variable_scope(scope, reuse=True): prev_words = T.ivector("prev_words") # disable dropout source_inputs = nn.embedding_lookup(source_embedding, src_seq) source_inputs = source_inputs + source_bias states, r_states = encoder.forward(source_inputs, src_mask) annotation = T.concatenate([states, r_states], 2) with ops.variable_scope("Specific"): domain_alpha = domain_sensitive_attention( annotation, src_mask, shdim * 2, domaindim) # domain_alpha = attention(r_states[0], annotation, nsrc_mask, # shdim, # shdim * 2) domain_context = T.sum(annotation * domain_alpha[:, :, None], 0) dfeature = nn.feedforward(domain_context, [shdim * 2, feadim], True, activation=T.tanh, scope="feature1") share_alpha = domain_sensitive_attention(annotation, src_mask, shdim * 2, domaindim) # share_alpha = attention(r_states[0], annotation, nsrc_mask, # shdim, # shdim * 2) share_context = T.sum(annotation * share_alpha[:, :, None], 0) sfeature = nn.feedforward(share_context, [shdim * 2, feadim], True, activation=T.tanh, scope="feature1") domain_gate = nn.feedforward([dfeature, annotation], [[feadim, shdim * 2], shdim * 2], True, scope="domain_gate") domain_annotation = annotation * domain_gate share_gate = nn.feedforward([sfeature, annotation], [[feadim, shdim * 2], shdim * 2], True, scope="share_gate") annotation = annotation * 
share_gate # decoder final_state = T.concatenate([ annotation[0, :, annotation.shape[-1] / 2:], domain_annotation[0, :, annotation.shape[-1] / 2:] ], -1) with ops.variable_scope(decoder_scope): initial_state = nn.feedforward(final_state, [shdim * 2, thdim], True, scope="initial", activation=T.tanh) mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic") mapped_domain_keys = map_key(domain_annotation, 2 * shdim, ahdim, "domain") prev_inputs = nn.embedding_lookup(target_embedding, prev_words) # prev_inputs = prev_inputs + target_bias cond = T.neq(prev_words, 0) # zeros out embedding if y is 0, which indicates <s> prev_inputs = prev_inputs * cond[:, None] with ops.variable_scope(decoder_scope): mask = T.ones_like(prev_words, dtype=dtype) next_state, context = decoder.step(prev_inputs, mask, initial_state, mapped_keys, annotation, src_mask, mapped_domain_keys, domain_annotation) if option["decoder"] == "GruSimple": probs = decoder.prediction(prev_inputs, initial_state, context) elif option["decoder"] == "GruCond": probs = decoder.prediction(prev_inputs, next_state, context) # encoding encoding_inputs = [src_seq, src_mask] encoding_outputs = [ annotation, initial_state, mapped_keys, mapped_domain_keys, domain_annotation ] encode = theano.function(encoding_inputs, encoding_outputs) if option["decoder"] == "GruSimple": prediction_inputs = [ prev_words, initial_state, annotation, mapped_keys, src_mask ] prediction_outputs = [probs, context] predict = theano.function(prediction_inputs, prediction_outputs) generation_inputs = [prev_words, initial_state, context] generation_outputs = next_state generate = theano.function(generation_inputs, generation_outputs) self.predict = predict self.generate = generate elif option["decoder"] == "GruCond": prediction_inputs = [ prev_words, initial_state, annotation, mapped_keys, src_mask, mapped_domain_keys, domain_annotation ] prediction_outputs = [probs, next_state] predict = theano.function(prediction_inputs, prediction_outputs) self.predict = predict self.cost = final_cost self.inputs = training_inputs self.outputs = training_outputs self.updates = [] # self.align = align # self.sample = sample self.encode = encode # self.get_snt_cost = get_snt_cost self.option = option
def RBF(sum_uni_l, sum_uni_r): eucli=T.sum((sum_uni_l-sum_uni_r)**2) return T.exp(-0.5*eucli).reshape((1,1))
def get_error(self, input):
    L = 0.5 * T.sum(T.sqr(self.a_test - input), axis=1)
    return T.mean(L)
def training(self, fea2obj, batch_size, learning_rate=0.005, steprule='adagrad', wait_epochs=5, kl_weight_init=None, klw_ep=50, klw_inc_rate=0, num_epochs=None): networkfile = self._config['net'] n_epochs = num_epochs or int(self._config['nepochs']) reg_weight=float(self._config['loss_weight']) reg_type=self._config['loss_reg'] numtrain = int(self._config['num_train']) if 'num_train' in self._config else None train_stream, num_samples_train = get_comb_stream(fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain) dev_stream, num_samples_dev = get_comb_stream(fea2obj, 'dev', batch_size=None, shuffle=False) logger.info('sources: %s -- number of train/dev samples: %d/%d', train_stream.sources, num_samples_train, num_samples_dev) t2idx = fea2obj['targets'].t2idx klw_init = kl_weight_init or float(self._config['kld_weight']) if 'kld_weight' in self._config else 1 logger.info('kl_weight_init: %d', klw_init) kl_weight = shared_floatx(klw_init, 'kl_weight') entropy_weight = shared_floatx(1., 'entropy_weight') cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate= build_model_new(fea2obj, len(t2idx), self._config, kl_weight, entropy_weight) cg = ComputationGraph(cost) weights = VariableFilter(roles=[WEIGHT])(cg.parameters) logger.info('Model weights are: %s', weights) if 'L2' in reg_type: cost += reg_weight * l2_norm(weights) logger.info('applying %s with weight: %f ', reg_type, reg_weight) dropout = -0.1 if dropout > 0: cg = apply_dropout(cg, weights, dropout) cost = cg.outputs[0] cost.name = 'cost' logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule, learning_rate) if 'adagrad' in steprule: cnf_step_rule = AdaGrad(learning_rate) elif 'adadelta' in steprule: cnf_step_rule = AdaDelta(decay_rate=0.95) elif 'decay' in steprule: cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90) cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)]) elif 'momentum' in steprule: cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9) elif 'adam' in steprule: cnf_step_rule = Adam(learning_rate=learning_rate) else: logger.info('The steprule param is wrong! 
which is: %s', steprule) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=cnf_step_rule, on_unused_sources='warn') #algorithm.add_updates(updates) gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) monitored_vars = [cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight, pat1_recog] train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True, before_first_epoch=True, prefix='tra') dev_monitor = DataStreamMonitoring(variables=[cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate], after_epoch=True, before_first_epoch=True, data_stream=dev_stream, prefix="dev") extensions = [dev_monitor, train_monitor, Timing(), TrackTheBest('dev_cost'), FinishIfNoImprovementAfter('dev_cost_best_so_far', epochs=wait_epochs), Printing(after_batch=False), #, ProgressBar() FinishAfter(after_n_epochs=n_epochs), saveload.Load(networkfile+'.toload.pkl'), ] + track_best('dev_cost', networkfile+ '.best.pkl') #extensions.append(SharedVariableModifier(kl_weight, # lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False)) # extensions.append(SharedVariableModifier(entropy_weight, # lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False)) logger.info('number of parameters in the model: %d', tensor.sum([p.size for p in cg.parameters]).eval()) logger.info('Lookup table sizes: %s', [p.size.eval() for p in cg.parameters if 'lt' in p.name]) main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, model=Model(cost), extensions=extensions) main_loop.run()
def Hx_plain():
    Hx_plain_splits = TT.grad(
        TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
        wrt=params,
        disconnected_inputs='warn')
    return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])
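# Hedged sketch of the Hessian-vector trick Hx_plain relies on: for a cost C and
# a vector v, grad(sum(grad(C) * v)) equals H v without ever forming H. The toy
# names below (x_var, v_var, hvp_fn) are illustrative and not from the original.
import theano
import theano.tensor as TT

x_var = TT.vector('x_var')
v_var = TT.vector('v_var')
toy_cost = TT.sum(x_var ** 2)          # Hessian of this cost is 2 * I
g_toy = TT.grad(toy_cost, x_var)
Hv = TT.grad(TT.sum(g_toy * v_var), x_var, disconnected_inputs='warn')
hvp_fn = theano.function([x_var, v_var], Hv, allow_input_downcast=True)
# hvp_fn([1., 2.], [3., 4.]) -> [6., 8.], i.e. 2 * v, matching H = 2I.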
def log_softmax(x):
    xdev = x - x.max(1, keepdims=True)
    return xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True))
def find_threshold(images):
    result, updates = th.scan(fn=lambda i: find_val(i),
                              outputs_info=None,
                              sequences=images)
    return T.sum(result) / images.shape[0]
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_sents = np.asarray(all_sentences[0], dtype='int32') train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels = np.asarray(all_labels[0], dtype='int32') train_size = len(train_labels) dev_sents = np.asarray(all_sentences[1], dtype='int32') dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX) dev_labels = np.asarray(all_labels[1], dtype='int32') dev_size = len(dev_labels) ''' combine train and dev ''' train_sents = np.concatenate([train_sents, dev_sents], axis=0) train_masks = np.concatenate([train_masks, dev_masks], axis=0) train_labels = np.concatenate([train_labels, dev_labels], axis=0) train_size = train_size + dev_size test_sents = np.asarray(all_sentences[2], dtype='int32') test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels = np.asarray(all_labels[2], dtype='int32') test_size = len(test_labels) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec', emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec' ], 40) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] # NN_para = multiCNN_para+ACNN_para conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb], axis=1) LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 12, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 12 prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix) loss = -T.mean(T.log(prob_pos)) ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, 
sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1) LR_att_input_size = hidden_size[0] + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_att_a = create_ensemble_para( rng, 12, LR_att_input_size) # the weight matrix hidden_size*2 LR_att_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_att_para = [U_att_a, LR_att_b] layer_att_LR = LogisticRegression( rng, input=LR_att_input, n_in=LR_att_input_size, n_out=12, W=U_att_a, b=LR_att_b ) #basically it is a multiplication between weight matrix and input feature vector att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax) #batch * 12 att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix, att_score_matrix) att_loss = -T.mean(T.log(att_prob_pos)) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l acnn_LR_input = T.concatenate( [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1) acnn_LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a = create_ensemble_para( rng, 12, acnn_LR_input_size) # the weight matrix hidden_size*2 acnn_LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) 
#(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para # put all model parameters together cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) ''' testing ''' ensemble_NN_scores = T.max(T.concatenate([ att_score_matrix.dimshuffle('x', 0, 1), score_matrix.dimshuffle('x', 0, 1), acnn_score_matrix.dimshuffle('x', 0, 1) ], axis=0), axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = 0.6 * ensemble_NN_scores + 0.4 * 0.5 * ( cosine_score_matrix + top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) cost_i = 0.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def main(): sym_y = T.imatrix('target_output') sym_mask = T.matrix('mask') sym_x = T.tensor3() TOL = 1e-5 num_epochs = config.epochs batch_size = config.batch_size #### DATA #### # print "@@@@TESTING@@@@" # l_in = nn.layers.InputLayer(shape=(None, 700, 42)) # l_dim_a = nn.layers.DimshuffleLayer( # l_in, (0,2,1)) # l_conv_a = nn.layers.Conv1DLayer( # incoming=l_dim_a, num_filters=42, border_mode='same', # filter_size=3, stride=1, nonlinearity=nn.nonlinearities.rectify) # l_dim_b = nn.layers.DimshuffleLayer( # l_conv_a, (0,2,1)) # out = nn.layers.get_output(l_dim_b, sym_x) # testvar = np.ones((128, 700, 42)).astype('float32') # print "@@@@EVAL@@@@" # john = out.eval({sym_x: testvar}) # print("Johns shape") # print(john.shape) print("Building network ...") ##########################DEBUG########################## l_in, l_out = config.build_model() ##########################DEBUG########################## all_layers = nn.layers.get_all_layers(l_out) num_params = nn.layers.count_params(l_out) print(" number of parameters: %d" % num_params) print(" layer output shapes:") for layer in all_layers: name = string.ljust(layer.__class__.__name__, 32) print(" %s %s" % (name, nn.layers.get_output_shape(layer))) print("Creating cost function") # lasagne.layers.get_output produces a variable for the output of the net out_train = nn.layers.get_output( l_out, sym_x, deterministic=False) # testvar = np.ones((128, 700, 42)).astype('float32') # john = out_train.eval({sym_x: testvar}) # print("@@@@@JOHN@@@@@") # print(john.shape) # print(john.reshape((-1, num_classes)).shape) print("Creating eval function") out_eval = nn.layers.get_output( l_out, sym_x, deterministic=True) probs_flat = out_train.reshape((-1, num_classes)) lambda_reg = config.lambda_reg params = nn.layers.get_all_params(l_out, regularizable=True) reg_term = sum(T.sum(p ** 2) for p in params) cost = T.nnet.categorical_crossentropy(T.clip(probs_flat, TOL, 1 - TOL), sym_y.flatten()) cost = T.sum(cost * sym_mask.flatten()) / T.sum(sym_mask) + lambda_reg * reg_term # Retrieve all parameters from the network all_params = nn.layers.get_all_params(l_out, trainable=True) # Setting the weights if hasattr(config, 'set_weights'): nn.layers.set_all_param_values(l_out, config.set_weights()) # Compute SGD updates for training print("Computing updates ...") if hasattr(config, 'learning_rate_schedule'): learning_rate_schedule = config.learning_rate_schedule # Import learning rate schedule else: learning_rate_schedule = {0: config.learning_rate} learning_rate = theano.shared(np.float32(learning_rate_schedule[0])) all_grads = T.grad(cost, all_params) cut_norm = config.cut_grad updates, norm_calc = nn.updates.total_norm_constraint(all_grads, max_norm=cut_norm, return_norm=True) if optimizer == "rmsprop": updates = nn.updates.rmsprop(updates, all_params, learning_rate) elif optimizer == "adadelta": updates = nn.updates.adadelta(updates, all_params, learning_rate) elif optimizer == "adagrad": updates = nn.updates.adagrad(updates, all_params, learning_rate) elif optimizer == "nag": momentum_schedule = config.momentum_schedule momentum = theano.shared(np.float32(momentum_schedule[0])) updates = nn.updates.nesterov_momentum(updates, all_params, learning_rate, momentum) else: sys.exit("please choose either <rmsprop/adagrad/adadelta/nag> in configfile") # Theano functions for training and computing cost print("config.batch_size %d" % batch_size) print("data.num_classes %d" % num_classes) if hasattr(config, 'build_model'): print("has build model") print("Compiling train 
...") # Use this for training (see deterministic = False above) train = theano.function( [sym_x, sym_y, sym_mask], [cost, out_train, norm_calc], updates=updates) print("Compiling eval ...") # use this for eval (deterministic = True + no updates) eval = theano.function([sym_x, sym_y, sym_mask], [cost, out_eval]) # Start timers start_time = time.time() prev_time = start_time all_losses_train = [] all_accuracy_train = [] all_losses_eval_train = [] all_losses_eval_valid = [] all_losses_eval_test = [] all_accuracy_eval_train = [] all_accuracy_eval_valid = [] all_accuracy_eval_test = [] all_mean_norm = [] import data X_train, X_valid, y_train, y_valid, mask_train, mask_valid, num_seq_train \ = data.get_train() print("y shape") print(y_valid.shape) print("X shape") print(X_valid.shape) # Start training for epoch in range(num_epochs): if (epoch % 10) == 0: print("Epoch %d of %d" % (epoch + 1, num_epochs)) if epoch in learning_rate_schedule: lr = np.float32(learning_rate_schedule[epoch]) print(" setting learning rate to %.7f" % lr) learning_rate.set_value(lr) if optimizer == "nag": if epoch in momentum_schedule: mu = np.float32(momentum_schedule[epoch]) print(" setting learning rate to %.7f" % mu) momentum.set_value(mu) print("Shuffling data") seq_names = np.arange(0, num_seq_train) np.random.shuffle(seq_names) X_train = X_train[seq_names] y_train = y_train[seq_names] mask_train = mask_train[seq_names] num_batches = num_seq_train // batch_size losses = [] preds = [] norms = [] for i in range(num_batches): idx = range(i * batch_size, (i + 1) * batch_size) x_batch = X_train[idx] y_batch = y_train[idx] mask_batch = mask_train[idx] loss, out, batch_norm = train(x_batch, y_batch, mask_batch) print(batch_norm) norms.append(batch_norm) preds.append(out) losses.append(loss) # if ((i+1) % config.write_every_batch == 0) | (i == 0): # if i == 0: # start_place = 0 # else: # start_place = i-config.write_every_batch # print "Batch %d of %d" % (i + 1, num_batches) # print " curbatch training loss: %.5f" % np.mean(losses[start_place:(i+1)]) # print " curbatch training acc: %.5f" % np.mean(accuracy[start_place:(i+1)]) predictions = np.concatenate(preds, axis=0) loss_train = np.mean(losses) all_losses_train.append(loss_train) acc_train = utils.proteins_acc(predictions, y_train[0:num_batches * batch_size], mask_train[0:num_batches * batch_size]) all_accuracy_train.append(acc_train) mean_norm = np.mean(norms) all_mean_norm.append(mean_norm) if 1 == 1: print(" average training loss: %.5f" % loss_train) print(" average training accuracy: %.5f" % acc_train) print(" average norm: %.5f" % mean_norm) sets = [ # ('train', X_train, y_train, mask_train, all_losses_eval_train, all_accuracy_eval_train), ('valid', X_valid, y_valid, mask_valid, all_losses_eval_valid, all_accuracy_eval_valid)] for subset, X, y, mask, all_losses, all_accuracy in sets: print(" validating: %s loss" % subset) preds = [] num_batches = np.size(X, axis=0) // config.batch_size for i in range(num_batches): ## +1 to get the "rest" print(i) idx = range(i * batch_size, (i + 1) * batch_size) x_batch = X[idx] y_batch = y[idx] mask_batch = mask[idx] loss, out = eval(x_batch, y_batch, mask_batch) preds.append(out) # acc = utils.proteins_acc(out, y_batch, mask_batch) losses.append(loss) # accuracy.append(acc) predictions = np.concatenate(preds, axis=0) print " pred" print(predictions.shape) print(predictions.dtype) loss_eval = np.mean(losses) all_losses.append(loss_eval) # acc_eval = np.mean(accuracy) acc_eval = utils.proteins_acc(predictions, y, mask) 
all_accuracy.append(acc_eval) print " average evaluation loss (%s): %.5f" % (subset, loss_eval) print " average evaluation accuracy (%s): %.5f" % (subset, acc_eval) now = time.time() time_since_start = now - start_time time_since_prev = now - prev_time prev_time = now est_time_left = time_since_start * num_epochs eta = datetime.now() + timedelta(seconds=est_time_left) eta_str = eta.strftime("%c") print " %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev) print " estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str) print if (epoch >= config.start_saving_at) and ((epoch % config.save_every) == 0): print " saving parameters and metadata" with open((metadata_path + "-%d" % (epoch) + ".pkl"), 'w') as f: pickle.dump({ 'config_name': config_name, 'param_values': nn.layers.get_all_param_values(l_out), 'losses_train': all_losses_train, 'accuracy_train': all_accuracy_train, 'losses_eval_train': all_losses_eval_train, 'losses_eval_valid': all_losses_eval_valid, 'losses_eval_test': all_losses_eval_test, 'accuracy_eval_valid': all_accuracy_eval_valid, 'accuracy_eval_train': all_accuracy_eval_train, 'accuracy_eval_test': all_accuracy_eval_test, 'mean_norm': all_mean_norm, 'time_since_start': time_since_start, 'i': i, }, f, pickle.HIGHEST_PROTOCOL) print(" stored in %s" % metadata_path) print
def check(X, i, j, a, b):
    k = T.sum(X[:, i:i + a, j:j + b])
    return k
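# Hedged usage sketch (toy names): check sums an a-by-b window starting at
# (i, j) across every slice along the first axis of a 3-D tensor.
import numpy as np
import theano
import theano.tensor as T

X_sym = T.tensor3('X_sym')
window_sum = theano.function([X_sym], check(X_sym, 0, 0, 2, 2),
                             allow_input_downcast=True)
# window_sum(np.ones((3, 4, 4))) -> 12.0  (a 2x2 window summed over 3 slices)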
def import_F(self, train_set_x, train_weight, train_label, batch_size): index = T.iscalar() print('import_Modeling...') self.f = { n: theano.function( [], f, name=n, givens={ self.X: train_set_x, self.Xlabel: train_label, self.Weight: train_weight }, on_unused_input='ignore') for n, f in zip(['U', 'KL_U', 'KL_X'], [self.U, self.KL_U, self.KL_X]) } self.f['LL'] = theano.function( [index], outputs=self.LL, givens={ self.X: train_set_x[index * batch_size:(index + 1) * batch_size], self.Xlabel: train_label[index * batch_size:(index + 1) * batch_size], self.Weight: train_weight }, on_unused_input='ignore') z = 0.0 * sum([T.sum(v) for v in self.params]) self.g = { vn: { gn: theano.function( [], T.grad(gv + z, vv), name='d' + gn + '_d' + vn, givens={ self.X: train_set_x, self.Xlabel: train_label, self.Weight: train_weight }, on_unused_input='ignore') for gn, gv in zip(['KL_U', 'KL_X'], [self.KL_U, self.KL_X]) } for vn, vv in self.wrt.items() } for vn, vv in self.wrt.items(): self.g[vn]['LL'] = theano.function( [index], T.grad(self.LL + z, vv), name='dLL' + '_d' + vn, givens={ self.X: train_set_x[index * batch_size:(index + 1) * batch_size], self.Xlabel: train_label[index * batch_size:(index + 1) * batch_size], self.Weight: train_weight }, on_unused_input='ignore')
def __init__(self, We_initial, params): initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX)) We = theano.shared(np.asarray(We_initial, dtype=config.floatX)) #symbolic params g1batchindices = T.imatrix() g2batchindices = T.imatrix() p1batchindices = T.imatrix() p2batchindices = T.imatrix() #get embeddings l_in = lasagne.layers.InputLayer((None, None, 1)) l_emb = lasagne.layers.EmbeddingLayer( l_in, input_size=We.get_value().shape[0], output_size=We.get_value().shape[1], W=We) l_average = lasagne_average_layer([l_emb]) embg1 = lasagne.layers.get_output(l_average, {l_in: g1batchindices}) embg2 = lasagne.layers.get_output(l_average, {l_in: g2batchindices}) embp1 = lasagne.layers.get_output(l_average, {l_in: p1batchindices}) embp2 = lasagne.layers.get_output(l_average, {l_in: p2batchindices}) #objective function g1g2 = (embg1 * embg2).sum(axis=1) g1g2norm = T.sqrt(T.sum(embg1**2, axis=1)) * T.sqrt( T.sum(embg2**2, axis=1)) g1g2 = g1g2 / g1g2norm p1g1 = (embp1 * embg1).sum(axis=1) p1g1norm = T.sqrt(T.sum(embp1**2, axis=1)) * T.sqrt( T.sum(embg1**2, axis=1)) p1g1 = p1g1 / p1g1norm p2g2 = (embp2 * embg2).sum(axis=1) p2g2norm = T.sqrt(T.sum(embp2**2, axis=1)) * T.sqrt( T.sum(embg2**2, axis=1)) p2g2 = p2g2 / p2g2norm costp1g1 = params.margin - g1g2 + p1g1 costp1g1 = costp1g1 * (costp1g1 > 0) costp2g2 = params.margin - g1g2 + p2g2 costp2g2 = costp2g2 * (costp2g2 > 0) cost = costp1g1 + costp2g2 self.all_params = lasagne.layers.get_all_params(l_average, trainable=True) self.network_params = lasagne.layers.get_all_params(l_average, trainable=True) self.network_params.pop(0) word_reg = 0.5 * params.LW * lasagne.regularization.l2(We - initial_We) cost = T.mean(cost) + word_reg #feedforward self.feedforward_function = theano.function([g1batchindices], embg1) self.cost_function = theano.function( [g1batchindices, g2batchindices, p1batchindices, p2batchindices], cost) prediction = g1g2 self.scoring_function = theano.function( [g1batchindices, g2batchindices], prediction) #updates if params.updatewords: grads = theano.gradient.grad(cost, self.all_params) if params.clip: grads = [ lasagne.updates.norm_constraint(grad, params.clip, range(grad.ndim)) for grad in grads ] updates = params.learner(grads, self.all_params, params.eta) else: grads = theano.gradient.grad(cost, self.network_params) if params.clip: grads = [ lasagne.updates.norm_constraint(grad, params.clip, range(grad.ndim)) for grad in grads ] updates = params.learner(grads, self.network_params, params.eta) self.train_function = theano.function( [g1batchindices, g2batchindices, p1batchindices, p2batchindices], cost, updates=updates)
def log_mvn(self, y, mean, beta):
    # Log-likelihood of the N x D data matrix `y` under a Gaussian with diagonal
    # noise: the diagonal of `beta` holds the per-point precisions.
    N = y.shape[0]
    D = y.shape[1]
    return -0.5 * D * T.sum(T.log(2 * np.pi * (1 / T.diag(beta)))) - 0.5 * T.sum(
        T.dot(beta, (y - mean)**2))
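# A hedged NumPy check of the diagonal-noise log-likelihood computed by log_mvn
# above, compared against scipy.stats.norm: `beta` is taken to be the N x N
# diagonal matrix of per-point precisions, which matches how the method is used.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
N, D = 5, 3
y, mean = rng.randn(N, D), rng.randn(N, D)
prec = rng.uniform(0.5, 2.0, size=N)          # per-point precisions
beta = np.diag(prec)

ll = (-0.5 * D * np.sum(np.log(2 * np.pi / prec))
      - 0.5 * np.sum(np.dot(beta, (y - mean) ** 2)))
ref = np.sum(stats.norm.logpdf(y, loc=mean, scale=(1.0 / prec[:, None]) ** 0.5))
print(np.allclose(ll, ref))  # True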
def LINnn(self, sl2, X): return sl2 * (T.sum(X**2, 1) + 1) + eps
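# A small NumPy illustration of LINnn above: it returns the diagonal of a linear
# kernel sl2 * (X X^T + 1) plus a jitter `eps` (defined elsewhere in the module;
# the value below is only illustrative).
import numpy as np

eps = 1e-6
sl2 = 0.7
X = np.random.RandomState(1).randn(4, 3)

diag = sl2 * (np.sum(X ** 2, axis=1) + 1) + eps
full = sl2 * (X.dot(X.T) + 1) + eps * np.eye(4)
print(np.allclose(diag, np.diag(full)))  # True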
import numpy as np import theano import theano.tensor as tt from theano.tests import unittest_tools as utt from kepler_op import KeplerOp kepler = KeplerOp() M = tt.dmatrix() e = tt.dmatrix() E = kepler(M, e) f = theano.function([M, e], E) g = theano.function([M, e], theano.grad(tt.sum(E), [M, e])) np.random.seed(42) N = (10, 2) pt = [np.random.uniform(5, 10, N), np.random.rand(*N)] print(f(*pt)) utt.verify_grad(kepler, pt)
def compile_F(self, train_set_x, train_weight, train_label, batch_size):
    index = T.iscalar()
    print('Modeling...')
    model_file_name = 'model2' + '.save'
    # Load a previously compiled model if one exists.
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
        self.f, self.g = obj
        print('Loaded!')
        return
    except Exception:
        print('Failed. Creating a new model...')

    self.f = {
        n: theano.function(
            [], f, name=n,
            givens={
                self.X: train_set_x,
                self.Xlabel: train_label,
                self.Weight: train_weight
            },
            on_unused_input='ignore')
        for n, f in zip(['U', 'KL_U', 'KL_X'], [self.U, self.KL_U, self.KL_X])
    }

    self.f['LL'] = theano.function(
        [index],
        outputs=self.LL,
        givens={
            self.X: train_set_x[index * batch_size:(index + 1) * batch_size],
            self.Xlabel: train_label[index * batch_size:(index + 1) * batch_size],
            self.Weight: train_weight
        },
        on_unused_input='ignore')

    # Zero-valued term that keeps every parameter connected to the graph, so that
    # T.grad below does not fail when a cost happens not to depend on a parameter.
    z = 0.0 * sum([T.sum(v) for v in self.params])

    self.g = {
        vn: {
            gn: theano.function(
                [], T.grad(gv + z, vv),
                name='d' + gn + '_d' + vn,
                givens={
                    self.X: train_set_x,
                    self.Xlabel: train_label,
                    self.Weight: train_weight
                },
                on_unused_input='ignore')
            for gn, gv in zip(['KL_U', 'KL_X'], [self.KL_U, self.KL_X])
        }
        for vn, vv in self.wrt.items()
    }

    for vn, vv in self.wrt.items():
        self.g[vn]['LL'] = theano.function(
            [index], T.grad(self.LL + z, vv),
            name='dLL' + '_d' + vn,
            givens={
                self.X: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.Xlabel: train_label[index * batch_size:(index + 1) * batch_size],
                self.Weight: train_weight
            },
            on_unused_input='ignore')

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(100000)
        pickle.dump([self.f, self.g], file_handle, protocol=pickle.HIGHEST_PROTOCOL)
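# A stand-alone sketch of the caching pattern used by compile_F above: compile the
# Theano functions once, pickle them to disk, and reload them on later runs. The
# file name and the toy function are illustrative; compiled theano.function objects
# are picklable, and very deep graphs may need a raised recursion limit.
import os
import pickle
import sys
import numpy as np
import theano
import theano.tensor as T

cache = 'cached_fn.save'
if os.path.exists(cache):
    with open(cache, 'rb') as fh:
        fn = pickle.load(fh)
else:
    x = T.dvector('x')
    fn = theano.function([x], T.sum(x ** 2))
    sys.setrecursionlimit(100000)
    with open(cache, 'wb') as fh:
        pickle.dump(fn, fh, protocol=pickle.HIGHEST_PROTOCOL)
print(fn(np.arange(3.0)))  # 5.0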
def cmmd(dataset='mnist.pkl.gz',batch_size=500, layer_num = 2, hidden_dim = 20,seed = 0,layer_size=[500,200,100]): validation_frequency = 1 test_frequency = 1 pre_train = 0 pre_train_epoch = 30 print "Loading data ......." datasets = datapy.load_data_gpu_60000(dataset, have_matrix = True) train_set_x, train_set_y, train_y_matrix = datasets[0] valid_set_x, valid_set_y, valid_y_matrix = datasets[1] test_set_x, test_set_y, test_y_matrix = datasets[2] n_train_batches = train_set_x.get_value().shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size rng = np.random.RandomState(seed) rng_share = theano.tensor.shared_randomstreams.RandomStreams(0) ################################ ## build model ## ################################ print "Building model ......." index = T.lscalar() x = T.matrix('x') ##### batch_size * 28^2 y = T.vector('y') y_matrix = T.matrix('y_matrix') random_z = T.matrix('random_z') ### batch_size * hidden_dim Inv_K_d = T.matrix('Inv_K_d') layers = [] layer_output= [] activation = nonlinearity.relu #activation = Tnn.sigmoid #### first layer layers.append(FullyConnected.FullyConnected( rng = rng, n_in = 28*28 + hidden_dim, #n_in = 28*28, n_out = layer_size[0], activation = activation )) layer_output.append(layers[-1].output_mix(input=[x,random_z])) #layer_output.append(layers[-1].output(input=x)) #### middle layer for i in range(layer_num): layers.append(FullyConnected.FullyConnected( rng = rng, n_in = layer_size[i], n_out = layer_size[i+1], activation = activation )) layer_output.append(layers[-1].output(input= layer_output[-1])) #### last layer activation = Tnn.sigmoid layers.append(FullyConnected.FullyConnected( rng = rng, n_in = layer_size[-1], n_out = 10, activation = activation )) y_gen = layers[-1].output(input = layer_output[-1]) lambda1_ = 1e-3 lambda_= theano.shared(np.asarray(lambda1_, dtype=np.float32)) K_d = kernel_gram_for_x(x,x,batch_size,28*28) K_s = K_d K_sd = K_d #Inv_K_d = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d)) Inv_K_s = Inv_K_d L_d = kernel_gram(y_matrix,y_matrix,batch_size,10) L_s = kernel_gram(y_gen,y_gen,batch_size,10) L_ds = kernel_gram(y_matrix,y_gen,batch_size,10) cost = -(NL.trace(K_d * Inv_K_d * L_d * Inv_K_d) +\ NL.trace(K_s * Inv_K_s * L_s * Inv_K_s)- \ NL.trace(K_sd * Inv_K_d * L_ds * Inv_K_s)) cost_pre = -T.sum(T.sqr(y_matrix - y_gen)) cc = T.argmax(y_gen,axis=1) correct = T.sum(T.eq(T.cast(T.argmax(y_gen,axis=1),'int32'),T.cast(y,'int32'))) ################################ ## updates ## ################################ params = [] for aLayer in layers: params += aLayer.params gparams = [T.grad(cost,param) for param in params] gparams_pre = [T.grad(cost_pre,param) for param in params] learning_rate = 3e-4 weight_decay=1.0/n_train_batches epsilon=1e-8 l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32)) get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r, decay1=0.1, decay2=0.001, weight_decay=weight_decay, epsilon=epsilon) updates = get_optimizer(params,gparams) updates_pre = get_optimizer(params,gparams_pre) ################################ ## pretrain model ## ################################ parameters = theano.function( inputs = [], outputs = params, ) ''' pre_train_model = theano.function( inputs = [index,random_z], outputs = [cost_pre, correct], updates=updates_pre, givens={ x:train_set_x[index * batch_size:(index + 1) * batch_size], y:train_set_y[index * batch_size:(index + 1) * 
batch_size], y_matrix:train_y_matrix[index * batch_size:(index + 1) * batch_size], }, on_unused_input='warn' ) cur_epoch = 0 if pre_train == 1: for cur_epoch in range(pre_train_epoch): print 'cur_epoch: ', cur_epoch, cor = 0 for minibatch_index in range(n_train_batches): cost_pre_mini,correct_pre_mini = pre_train_model(minibatch_index,gen_random_z(batch_size,hidden_dim)) cor = cor + correct_pre_mini print 'correct number: ' , cor #np.savez(,model = model) ''' if pre_train == 1: print "pre-training model....." pre_train = np.load('model.npz')['model'] for (para, pre) in zip(params, pre_train): para.set_value(pre) ################################ ## prepare data ## ################################ #### compute matrix inverse print "Preparing data ...." Invv = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d)) prepare_data = theano.function( inputs = [index], outputs = [Invv,K_d], givens = { x:train_set_x[index * batch_size:(index + 1) * batch_size], } ) Inv_K_d_l, K_d_l = prepare_data(0) for minibatch_index in range(1, n_train_batches): if minibatch_index % 10 == 0: print 'minibatch_index:', minibatch_index Inv_pre_mini, K_d_pre_mini = prepare_data(minibatch_index) Inv_K_d_l = np.vstack((Inv_K_d_l,Inv_pre_mini)) K_d_l = np.vstack((K_d_l,K_d_pre_mini)) Inv_K_d_g = theano.shared(Inv_K_d_l,borrow=True) K_d_g = theano.shared(K_d_l, borrow=True) ################################ ## train model ## ################################ train_model = theano.function( inputs = [index,random_z], outputs = [correct,cost,y,cc,y_gen], updates=updates, givens={ x:train_set_x[index * batch_size:(index + 1) * batch_size], y:train_set_y[index * batch_size:(index + 1) * batch_size], y_matrix:train_y_matrix[index * batch_size:(index + 1) * batch_size], #K_d:K_d_g[index * batch_size:(index + 1) * batch_size], Inv_K_d:Inv_K_d_g[index * batch_size:(index + 1) * batch_size], }, on_unused_input='warn' ) valid_model = theano.function( inputs = [index,random_z], outputs = correct, #updates=updates, givens={ x:valid_set_x[index * batch_size:(index + 1) * batch_size], y:valid_set_y[index * batch_size:(index + 1) * batch_size], y_matrix:valid_y_matrix[index * batch_size:(index + 1) * batch_size], }, on_unused_input='warn' ) test_model = theano.function( inputs = [index,random_z], outputs = [correct,y_gen], #updates=updates, givens={ x:test_set_x[index * batch_size:(index + 1) * batch_size], y:test_set_y[index * batch_size:(index + 1) * batch_size], y_matrix:test_y_matrix[index * batch_size:(index + 1) * batch_size], }, on_unused_input='warn' ) n_epochs = 500 cur_epoch = 0 print "Training model ......" 
while (cur_epoch < n_epochs):
    cur_epoch = cur_epoch + 1
    cor = 0
    for minibatch_index in xrange(n_train_batches):
        print minibatch_index,
        print " : ",
        correct, cost, a, b, y_gen = train_model(minibatch_index, gen_random_z(batch_size, hidden_dim))
        cor = cor + correct
        print correct
        print b
        print y_gen
    with open('log.txt', 'a') as f:
        print >>f, "epoch: ", cur_epoch, "training_correct: ", cor

    if cur_epoch % validation_frequency == 0:
        cor2 = 0
        for minibatch_index in xrange(n_valid_batches):
            correct = valid_model(minibatch_index, gen_random_z(batch_size, hidden_dim))
            cor2 = cor2 + correct
        with open('log.txt', 'a') as f:
            print >>f, " validation_correct: ", cor2

    if cur_epoch % test_frequency == 0:
        cor2 = 0
        for minibatch_index in xrange(n_test_batches):
            correct, y_gen = test_model(minibatch_index, gen_random_z(batch_size, hidden_dim))
            with open('log.txt', 'a') as f:
                for index in range(batch_size):
                    if not np.argmax(y_gen[index]) == test_set_y[minibatch_index * batch_size + index]:
                        print >>f, "index: ", minibatch_index * batch_size + index, 'true Y: ', test_set_y[minibatch_index * batch_size + index]
                        print >>f, 'gen_y: ', y_gen[index]
            cor2 = cor2 + correct
        with open('log.txt', 'a') as f:
            print >>f, " test_correct: ", cor2

    # Save a snapshot of the parameters every epoch.
    if cur_epoch % 1 == 0:
        model = parameters()
        for i in range(len(model)):
            model[i] = np.asarray(model[i]).astype(np.float32)
        np.savez('model-' + str(cur_epoch), model=model)
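# `gen_random_z` is called in the training loop above but not defined in this
# snippet; a minimal sketch under the assumption that it draws standard-normal
# noise of shape (batch_size, hidden_dim) in Theano's floatX dtype.
import numpy as np
import theano

def gen_random_z(batch_size, hidden_dim):
    return np.random.standard_normal((batch_size, hidden_dim)).astype(theano.config.floatX)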
def __init__(self, D, M, Q, Domain_number, m, pre_params, Hiddenlayerdim1, Hiddenlayerdim2):
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    N = self.X.shape[0]
    self.Weight = T.matrix('Weight')

    ker = kernel(Q)
    mmd = MMD(M, Domain_number)

    mu_value = np.random.randn(M, D)
    Sigma_b_value = np.zeros((M, M)) + np.log(0.01)
    Z_value = m[:M]
    self.test = Z_value
    ls_value = np.zeros(Domain_number) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=Hiddenlayerdim1,
                                     activation=T.nnet.relu, number='_x')
    self.hiddenLayer_hidden = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output,
                                          n_in=Hiddenlayerdim1, n_out=Hiddenlayerdim2,
                                          activation=T.nnet.relu, number='_h')
    self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=Q,
                                     activation=T.nnet.relu, number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=Q,
                                     activation=T.nnet.relu, number='_S')

    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    self.loc_params.extend(self.hiddenLayer_hidden.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)

    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i

    self.params.extend(ker.params)
    self.params.extend(mmd.params)

    self.hyp_params = {}
    for i in [self.mu, self.Sigma_b, self.ls]:
        self.hyp_params[str(i)] = i

    self.Z_params = {}
    for i in [self.Z]:
        self.Z_params[str(i)] = i

    self.global_params = {}
    for i in self.params:
        self.global_params[str(i)] = i

    self.params.extend(self.hiddenLayer_x.params)
    self.params.extend(self.hiddenLayer_hidden.params)
    self.params.extend(self.hiddenLayer_m.params)
    self.params.extend(self.hiddenLayer_S.params)

    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i

    for i, j in pre_params.items():
        self.wrt[i].set_value(j)

    m = self.hiddenLayer_m.output
    S_0 = self.hiddenLayer_S.output
    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    eps_M = srng.normal((M, D))  # The mean and the covariance need different random draws, so keep them separate.

    beta = T.exp(self.ls)
    # Sigma for U is not diagonal, so build a lower-triangular (Cholesky-like) factor.
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) + T.diag(T.exp(T.diag(self.Sigma_b))))

    # Rescale by the kernel signal variance.
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

    Knn = ker.RBF(Xtilda)
    Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

    Kinterval = T.dot(KmmInv, Kmn)

    mean_U = T.dot(Kinterval.T, self.U)
    betaI = T.diag(T.dot(self.Xlabel, beta))
    Covariance = betaI

    self.LL = (self.log_mvn(self.X, mean_U, Covariance) - 0.5 * T.sum(T.dot(betaI, Ktilda)))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
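# A NumPy sketch of the triangular parameterisation used for Sigma above: keep the
# strict lower triangle of the unconstrained matrix Sigma_b and exponentiate its
# diagonal, so the factor has a positive diagonal and Sigma Sigma^T stays positive
# definite. The 3 x 3 size is illustrative.
import numpy as np

Sigma_b = np.random.RandomState(2).randn(3, 3)
Sigma = np.tril(Sigma_b - np.diag(np.diag(Sigma_b)) + np.diag(np.exp(np.diag(Sigma_b))))
print(np.all(np.linalg.eigvalsh(Sigma.dot(Sigma.T)) > 0))  # True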