Example #1
    def _calc_regularization_cost(self):
        """Calculate the regularization cost given the weight decay parameters.

        Only the parameters stored in the set self.regularize will be
        considered.

        Returns
        -------
        theano variable
            regularization cost depending on the parameters to be regularized
            and the weight decay parameters for L1 and L2 regularization.
        """
        cost = theano.shared(value=np.cast[floatX](.0))
        l1_cost = 0
        l2_cost = 0

        for p in self.regularize:
            l1_cost += T.sum(T.abs_(self.__dict__[p]))
            l2_cost += T.sum(T.sqr(self.__dict__[p]))

        l1_cost = debug_print(l1_cost, 'l1_cost')
        l2_cost = debug_print(l2_cost, 'l2_cost')

        if self.l1_weight != 0:
            cost += self.l1_weight * l1_cost

        if self.l2_weight != 0:
            cost += self.l2_weight * l2_cost

        return cost
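A minimal standalone sketch of the same T.sum-based L1/L2 pattern, using hypothetical shared parameters and decay weights (not part of the class above):

import numpy as np
import theano
import theano.tensor as T

# hypothetical parameters; in the class above these live in self.regularize
W = theano.shared(np.random.randn(5, 3).astype(theano.config.floatX), name='W')
b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')
l1_weight, l2_weight = 1e-4, 1e-3

l1_cost = sum(T.sum(T.abs_(p)) for p in (W, b))
l2_cost = sum(T.sum(T.sqr(p)) for p in (W, b))
reg_cost = l1_weight * l1_cost + l2_weight * l2_cost
print(theano.function([], reg_cost)())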
Example #2
def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
    """
    l = (uw.size / 2).astype('int64')
    u = uw[:l]
    w = uw[l:]

    # Callable tensor
    logp_ = lambda input: theano.clone(logp, {inarray: input}, strict=False)

    # Naive Monte-Carlo
    r = MRG_RandomStreams(seed=random_seed)

    if n_mcsamples == 1:
        n = r.normal(size=inarray.tag.test_value.shape)
        q = n * exp(w) + u
        elbo = logp_(q) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))
    else:
        n = r.normal(size=(n_mcsamples, u.tag.test_value.shape[0]))
        qs = n * exp(w) + u
        logps, _ = theano.scan(fn=lambda q: logp_(q),
                               outputs_info=None,
                               sequences=[qs])
        elbo = tt.mean(logps) + tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))

    return elbo
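The `tt.sum(w) + 0.5 * l * (1 + np.log(2.0 * np.pi))` term is the entropy of the diagonal Gaussian q with log standard deviations w; a plain-NumPy check of that identity (not part of the function above):

import numpy as np

rng = np.random.RandomState(0)
w = rng.randn(4)          # log standard deviations of q
l = w.size
# closed form as used in _elbo_t
entropy_closed_form = 0.5 * l * (1.0 + np.log(2.0 * np.pi)) + np.sum(w)
# entropy of a diagonal Gaussian, summed dimension-wise
entropy_per_dim = 0.5 * np.log(2.0 * np.pi * np.e * np.exp(2.0 * w))
assert np.allclose(entropy_closed_form, np.sum(entropy_per_dim))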
Example #3
    def apply_rnnlm(self, sentence, sentence_mask, sentence_morph, sentence_morph_mask,
                    sentence_char, sentence_char_mask, use_noise=1):
        src, src_mask = sentence[:-1], sentence_mask[:-1]
        tgt, tgt_mask = sentence[1:], sentence_mask[1:]
        src_morph, src_morph_mask = sentence_morph[:-1], sentence_morph_mask[:-1]
        src_char, src_char_mask = sentence_char[:-1], sentence_char_mask[:-1]

        emb_lstm_range = T.arange(self.n_emb_lstm)
        #word lookup table
        table = LookupTable(self.n_emb_lstm, self.vocab_size, name='Wemb')
        src_emb = table.apply(src, emb_lstm_range)
        self.layers.append(table)

        if self.dropout < 1.0:
            src_emb = DropoutLayer(src_emb, use_noise, self.dropout)

        rnn_layer_1st = NormalRNN(self.n_emb_lstm, self.n_hids)
        hiddens = rnn_layer_1st.apply(src_emb, src_mask)
        self.layers.append(rnn_layer_1st)

        logistic_layer = LogisticRegression(hiddens, self.n_hids, self.vocab_size)
        self.layers.append(logistic_layer)

        self.cost = logistic_layer.cost(tgt, tgt_mask)
        for layer in self.layers:
            self.params.extend(layer.params)
        self.L2 = sum(T.sum(item ** 2) for item in self.params)
        self.L1 = sum(T.sum(abs(item)) for item in self.params)
Example #4
    def _calc_regularization_cost(self):
        """Calculate the regularization cost given the weight decay parameters.

        Only the parameters stored in the set self.regularize will be
        considered. We need to handle this manually in this class because the
        weight matrices contain bias columns, which should not be included in
        the regularization computation. Therefore, do not add W1 and W2 to
        self.regularize.

        Returns
        -------
        theano variable
            regularization cost depending on the parameters to be regularized
            and the weight decay parameters for L1 and L2 regularization.
        """
        cost = super(SLmNce, self)._calc_regularization_cost()
        l1_cost = T.sum(T.abs_(self.W1[:, :-1]))
        l1_cost += T.sum(T.abs_(self.W2[:, :-1]))
        l2_cost = T.sum(T.sqr(self.W1[:, :-1]))
        l2_cost += T.sum(T.sqr(self.W2[:, :-1]))

        if self.l1_weight != 0:
            cost += self.l1_weight * l1_cost

        if self.l2_weight != 0:
            cost += self.l2_weight * l2_cost

        return cost
Example #5
    def _compute_local_cn_acts(self, input, W):
        # Without Scan (Faster than scan, but still way too slow)
        shuffledIn    = input.dimshuffle(0,1,'x')
        shuffledMasks = self.localmask.dimshuffle('x',0,1)

        # cubeIn = T.repeat(shuffledIn,self.localmask.shape[1],2)
        # cubeMasks = T.repeat(shuffledMasks,input.shape[0],0)

        maskedIn     = shuffledIn * shuffledMasks
        maskedInMean = T.sum(maskedIn,axis=1,keepdims=True) / T.sum(shuffledMasks,axis=1,keepdims=True)
        maskedInVar  = T.sum(T.sqr((maskedIn-maskedInMean)*shuffledMasks),axis=1,keepdims=True)/T.sum(shuffledMasks,axis=1,keepdims=True)
        maskedInSTD  = T.sqrt(maskedInVar)

        maskedInSubMean = maskedIn - maskedInMean
        maskedCN        = maskedInSubMean / maskedInSTD
        # maskedCN = maskedInSubMean

        shuffledInCN = maskedCN.dimshuffle(2,0,1)

        allOuts      = T.dot(shuffledInCN, W)

        diagMask     = T.eye(self.localmask.shape[1],self.localmask.shape[1]).dimshuffle(0,'x',1)
        diagMaskAll  = allOuts * diagMask

        activation   = T.sum(diagMaskAll,axis=0)
        return activation
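A plain-NumPy sketch of the masked mean/variance normalization used above, with hypothetical shapes and a hypothetical local mask (a rough check of the broadcasting logic, not the class method itself):

import numpy as np

x = np.random.RandomState(0).randn(2, 6)                   # (batch, features)
mask = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 0],
                 [0, 1, 1], [0, 0, 1], [0, 0, 1]], dtype=float)  # (features, groups)

xin = x[:, :, None] * mask[None, :, :]                      # (batch, features, groups)
denom = mask.sum(axis=0)[None, None, :]
mean = xin.sum(axis=1, keepdims=True) / denom
var = (((xin - mean) * mask[None, :, :]) ** 2).sum(axis=1, keepdims=True) / denom
cn = (xin - mean) / np.sqrt(var)                            # masked contrast-normalized input
print(cn.shape)                                             # (2, 6, 3)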
Example #6
  def __init__(self, kernel, max_iter = 10, max_diff = None):
    """

    :param kernel: a function with a signature (expected, observed) -> a similarity measure
    that accepts symbolic theano expressions and returns them accordingly.
    See `crayimage.hotornot.em.kernels` for examples.
    :param max_iter: maximal number of iterations
    :param max_diff: stop iterating if the maximal difference in weights from the previous iteration
    is smaller than `max_diff`. If None, this check is not performed.
    """
    self.original_shape = None

    self.kernel = kernel
    self.max_iter = max_iter
    self.max_diff = max_diff

    self.X = theano.shared(
      np.zeros(shape=(0, 0), dtype='float32')
    )

    self.weights = theano.shared(
      np.ones(shape=(0, ), dtype='float32')
    )

    canonical = T.sum(self.weights[:, None] * self.X, axis=0) / T.sum(self.weights)

    weights_updates = self.kernel(canonical, self.X)
    weights_diff = T.max(abs(weights_updates - self.weights))

    upd = {
      self.weights : weights_updates
    }

    self.iteration = theano.function([], weights_diff if max_diff is not None else [], updates=upd)
    self.get_canonical = theano.function([], canonical)
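A plain-NumPy sketch of what one call to `iteration` does, with a hypothetical Gaussian kernel (the actual kernels live in `crayimage.hotornot.em.kernels`):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(10, 5).astype('float32')       # observations
weights = np.ones(10, dtype='float32')

# weighted mean as the canonical observation
canonical = np.sum(weights[:, None] * X, axis=0) / np.sum(weights)
# re-weight each observation by its similarity to the canonical one
new_weights = np.exp(-np.sum((canonical - X) ** 2, axis=1))
max_diff = np.max(np.abs(new_weights - weights))
print(canonical.shape, max_diff)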
Example #7
 def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total):
     sigma2 = tt.square(sigma)
     Kuu = cov_total(Xu)
     Kuf = cov_total(Xu, X)
     Luu = cholesky(stabilize(Kuu))
     A = solve_lower(Luu, Kuf)
     Qffd = tt.sum(A * A, 0)
     if self.approx == "FITC":
         Kffd = cov_total(X, diag=True)
         Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
     else:  # VFE or DTC
         Lamd = tt.ones_like(Qffd) * sigma2
     A_l = A / Lamd
     L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
     r = y - mean_total(X)
     r_l = r / Lamd
     c = solve_lower(L_B, tt.dot(A, r_l))
     Kus = self.cov_func(Xu, Xnew)
     As = solve_lower(Luu, Kus)
     mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As), solve_upper(tt.transpose(L_B), c))
     C = solve_lower(L_B, As)
     if diag:
         Kss = self.cov_func(Xnew, diag=True)
         var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
         if pred_noise:
             var += sigma2
         return mu, var
     else:
         cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) +
                tt.dot(tt.transpose(C), C))
         if pred_noise:
             cov += sigma2 * tt.identity_like(cov)
         return mu, stabilize(cov)
Example #8
 def _construct_mom_stuff(self):
     """
     Construct the cost function for the moment-matching "regularizer".
     """
     a = self.mom_mix_rate
     dist_mean = self.GN.dist_mean
     dist_cov = self.GN.dist_cov
     # Get the generated sample observations for this batch, transformed
     # linearly into the desired space for moment matching...
     X_b = T.dot(self.GN.output, self.mom_match_proj)
     # Get their mean
     batch_mean = T.mean(X_b, axis=0)
     # Get the updated generator distribution mean
     new_mean = ((1.0 - a[0]) * self.GN.dist_mean) + (a[0] * batch_mean)
     # Use the mean to get the updated generator distribution covariance
     X_b_minus_mean = X_b - new_mean
      # cast the batch size to floatX so the division keeps the result in floatX
     batch_cov = T.dot(X_b_minus_mean.T, X_b_minus_mean) / T.cast(X_b.shape[0], 'floatX')
     new_cov = ((1.0 - a[0]) * self.GN.dist_cov) + (a[0] * batch_cov)
     # Get the cost for deviation from the target distribution's moments
     mean_err = new_mean - self.target_mean
     cov_err = (new_cov - self.target_cov)
     mm_cost = self.mom_match_weight[0] * \
             (T.sum(mean_err**2.0) + T.sum(cov_err**2.0))
     # Construct the updates for the running estimates of the generator
     # distribution's first and second-order moments.
     mom_updates = OrderedDict()
     mom_updates[self.GN.dist_mean] = new_mean
     mom_updates[self.GN.dist_cov] = new_cov
     return [mm_cost, mom_updates]
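A plain-NumPy sketch of the same moment-matching update, with hypothetical shapes and mixing rate, for reference:

import numpy as np

rng = np.random.RandomState(0)
a = 0.05                                           # mom_mix_rate
X_b = rng.randn(64, 8)                             # projected generator samples
dist_mean, dist_cov = np.zeros(8), np.eye(8)       # running estimates
target_mean, target_cov = np.zeros(8), np.eye(8)   # target moments

new_mean = (1.0 - a) * dist_mean + a * X_b.mean(axis=0)
X_c = X_b - new_mean
new_cov = (1.0 - a) * dist_cov + a * (X_c.T.dot(X_c) / X_b.shape[0])
mm_cost = np.sum((new_mean - target_mean) ** 2) + np.sum((new_cov - target_cov) ** 2)
print(mm_cost)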
Example #9
def orthogonal_penalty(W, D, epsilon=1e-6, axis=1):
    num = T.sqr(T.sum(W * D, axis=axis))                 # n = (d^T w)^2
    den = T.sum(T.sqr(W), axis=axis) * T.sum(T.sqr(D), axis=axis)  # d = ||w||_2^2 * ||d||_2^2
    cos = num / den                                      # c = n / d
    value = cos - (epsilon**2)                           # v = c - epsilon^2
    hinge = value * (value > 0)                          # h = [ v ]_+
    return T.sum(hinge)
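A small hypothetical usage of `orthogonal_penalty` as defined above: the penalty is zero for rows of W orthogonal to the matching rows of D and grows with their squared cosine similarity.

import numpy as np
import theano
import theano.tensor as T

W_sym, D_sym = T.matrix('W'), T.matrix('D')
penalty_fn = theano.function([W_sym, D_sym], orthogonal_penalty(W_sym, D_sym))

W_val = np.asarray([[1., 0.], [1., 1.]], dtype=theano.config.floatX)
D_val = np.asarray([[0., 1.], [1., 0.]], dtype=theano.config.floatX)
print(penalty_fn(W_val, D_val))  # ~0.5: only the second row pair contributes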
Example #10
def smooth_softmax(x):
    """Softmax that shouldn't overflow, with Laplacish smoothing."""
    eps = 0.0001
    e_x = T.exp(x - T.max(x, axis=1, keepdims=True))
    p = (e_x / T.sum(e_x, axis=1, keepdims=True)) + constFX(eps)
    p_sm = p / T.sum(p, axis=1, keepdims=True)
    return p_sm
Example #11
def test_pickle_unpickle_without_reoptimization():
    mode = theano.config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    x1 = T.fmatrix('x1')
    x2 = T.fmatrix('x2')
    x3 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    x4 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    y = T.sum(T.sum(T.sum(x1**2 + x2) + x3) + x4)

    updates = OrderedDict()
    updates[x3] = x3 + 1
    updates[x4] = x4 + 1
    f = theano.function([x1, x2], y, updates=updates, mode=mode)

    # now pickle the compiled theano fn
    string_pkl = pickle.dumps(f, -1)

    # compute f value
    in1 = numpy.ones((10, 10), dtype=floatX)
    in2 = numpy.ones((10, 10), dtype=floatX)

    # test unpickle without optimization
    default = theano.config.reoptimize_unpickled_function
    try:
        # the default is True
        theano.config.reoptimize_unpickled_function = False
        f_ = pickle.loads(string_pkl)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        theano.config.reoptimize_unpickled_function = default
Example #12
    def eq_log_pstar_vgh(self, g_hat, h_hat, s1_hat, s0_hat, v):
        """
        Computes the expectation (under the variational distribution q(g,h)=q(g)q(h)) of the
        log un-normalized probability, i.e. log p^*(g,h,s,v)
        :param g_hat: T.matrix of shape (batch_size, n_g)
        :param h_hat: T.matrix of shape (batch_size, n_h)
        :param v    : T.matrix of shape (batch_size, n_v)
        """
        from_v = self.from_v(v)
        from_h = self.from_h(h_hat)
        from_g = self.from_g(g_hat)

        # center variables
        cg_hat = g_hat - self.cg if self.flags['center_g'] else g_hat
        ch_hat = h_hat - self.ch if self.flags['center_h'] else h_hat
        # compute expectation of various s-quantities
        s_hat  = self.s_hat(ch_hat, s1_hat, s0_hat)
        ss_hat = self.s_hat(ch_hat, s1_hat**2 + 1./self.alpha_prec,
                                    s0_hat**2 + 1./self.alpha_prec)

        lq  = 0.
        lq += T.sum(from_v * self._mu * from_h, axis=1)
        lq += T.sum(from_v * s1_hat * from_h, axis=1)
        lq -= 0.5 * T.sum(self.alpha_prec * ss_hat, axis=1)
        lq -= T.sum(0.5 * self.lambd_prec * v**2, axis=1)
        lq += T.sum(self.alpha_prec * from_g  * s_hat, axis=1)
        lq += T.dot(cg_hat, self.gbias)
        lq += T.dot(ch_hat, self.hbias)
        return T.mean(lq), [g_hat, h_hat, s_hat, ss_hat, s1_hat, s0_hat, v]
Example #13
 def dev_loss(self, dev_types, dev_lams, ss_ratio, y):
     su_mask = ss_ratio * T.neq(y, 0).reshape((y.shape[0], 1))
     un_mask = T.eq(y, 0).reshape((y.shape[0], 1))
     ss_mask = su_mask + un_mask
     var_fun = lambda x1, x2: T.sum(((x1 - x2) * ss_mask)**2.0) / T.sum(ss_mask)
     tanh_fun = lambda x1, x2: var_fun(T.tanh(x1), T.tanh(x2))
     norm_fun = lambda x1, x2: var_fun( \
             (x1 / T.sqrt(T.sum(x1**2.0,axis=1,keepdims=1) + 1e-6)), \
             (x2 / T.sqrt(T.sum(x2**2.0,axis=1,keepdims=1) + 1e-6)))
     sigm_fun = lambda x1, x2: var_fun(T.nnet.sigmoid(x1), T.nnet.sigmoid(x2))
     cent_fun = lambda xt, xo: T.sum(T.nnet.binary_crossentropy( \
             T.nnet.sigmoid(xo), T.nnet.sigmoid(xt))) / xt.shape[0]
     L = 0.0
     for i in xrange(self.layer_count):
         if (i < (self.layer_count - 1)):
             x1 = self.layers[i].output
             x2 = self.drop_nets[0][i].output
         else:
             x1 = self.layers[i].linear_output
             x2 = self.drop_nets[0][i].linear_output
         if (dev_types[i] == 1):
             L = L + (dev_lams[i] * norm_fun(x1, x2))
         elif (dev_types[i] == 2):
             L = L + (dev_lams[i] * tanh_fun(x1, x2))
         elif (dev_types[i] == 3):
             L = L + (dev_lams[i] * sigm_fun(x1, x2))
         elif (dev_types[i] == 4):
             L = L + (dev_lams[i] * cent_fun(x1, x2))
         else:
             L = L + (dev_lams[i] * var_fun(x1, x2))
     return L
Example #14
	def __init__(self,
				 word_vec_width,
				 batch_size,
				 num_hidden,
				 learning_rate=0.1):
		self.num_hidden = num_hidden
		self.learning_rate = learning_rate
		self.word_vec_width = word_vec_width
		self.batch_size = batch_size

		self.vocab_mat = T.fmatrix('vocab')
		self.word_onehot = T.fmatrix('word_onehot')
		b = T.fvector('b')
		W = T.fmatrix('W')
		f = 1 / (1 + T.exp(-(W * (self.word_onehot.dot(self.vocab_mat) + b))))
		s = T.sum(f)

		self.exec_fn = theano.function(
			[self.word_onehot, b, W, self.vocab_mat],
			f,
			allow_input_downcast=True)

		self.word_onehot_c = T.fmatrix('word_onehot_c')
		f_c = 1 / (1 + T.exp(-(W * (self.word_onehot_c.dot(self.vocab_mat)) + b)))
		s_c = T.sum(f_c)

		J = T.largest(0, 1 - s + s_c)
		self.grad = theano.grad(J, [b, W, self.vocab_mat])

		self.grad_fn = theano.function(
			[self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat],
			self.grad,
			allow_input_downcast=True)
Example #15
 def __init__(self, vocab_size, dim, lr=0.5):
     W = np.asarray(np.random.rand(vocab_size, dim),
                    dtype=theano.config.floatX) / float(dim)
     W1 = np.asarray((np.random.rand(vocab_size, dim)),
                     dtype=theano.config.floatX) / float(dim)
     self.W = theano.shared(W, name='W', borrow=True)
     self.W1 = theano.shared(W1, name='W1', borrow=True)
     gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     gW1 = np.asarray(
         np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     self.gW = theano.shared(gW, name='gW', borrow=True)
     self.gW1 = theano.shared(gW1, name='gW1', borrow=True)
     X = T.vector()
     fX = T.vector()
     ind_W = T.ivector()
     ind_W1 = T.ivector()
     w = self.W[ind_W, :]
     w1 = self.W1[ind_W1, :]
     cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2))
     grad = T.clip(T.grad(cost, [w, w1]), -5.0, 5.0)
     updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :],
                                           grad[0] ** 2))]
     updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :],
                                            grad[1] ** 2))]
     updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :],
                                          - (lr / T.sqrt(self.gW[ind_W, :])) *
                                          grad[0]))]
     updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :],
                                           - (lr / T.sqrt(self.gW1[ind_W1, :])) *
                                           grad[1]))]
     updates = updates1 + updates2 + updates3 + updates4
     self.cost_fn = theano.function(
         inputs=[ind_W, ind_W1, X, fX], outputs=cost, updates=updates)
Example #16
def unet_crossentropy_loss_sampled(y_true, y_pred):
    print 'unet_crossentropy_loss_sampled'
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0-epsilon))
    y_true = T.flatten(y_true)
    # this works, though there is likely a cleaner way to express it in Theano
    # filter the indices of positive and negative labels
    indPos = T.nonzero(y_true)[0]  # T.nonzero returns a tuple of index arrays, one per dimension
    indNeg = T.nonzero(1-y_true)[0]
    # shuffle
    n = indPos.shape[0]
    indPos = indPos[srng.permutation(n=n)]
    n = indNeg.shape[0]
    indNeg = indNeg[srng.permutation(n=n)]
    # take equal number of samples depending on which class has less
    n_samples = T.cast(T.min([T.sum(y_true), T.sum(1-y_true)]), dtype='int64')

    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]
    loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1-y_pred_clipped[indNeg]))
    average_loss = T.mean(loss_vector)
    print 'average_loss:', average_loss
    return average_loss
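A plain-NumPy sketch of the class-balanced sampling idea above, using hypothetical flattened labels and predictions:

import numpy as np

rng = np.random.RandomState(0)
y_true = (rng.rand(20) > 0.7).astype('float32')
y_pred = np.clip(rng.rand(20), 1e-4, 1.0 - 1e-4)

ind_pos = rng.permutation(np.nonzero(y_true)[0])
ind_neg = rng.permutation(np.nonzero(1.0 - y_true)[0])
n_samples = int(min(y_true.sum(), (1.0 - y_true).sum()))
loss = -np.mean(np.log(y_pred[ind_pos[:n_samples]])) \
       - np.mean(np.log(1.0 - y_pred[ind_neg[:n_samples]]))
print(loss)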
Example #17
 def KLD_X(self,m,S):
     N = m.shape[0]
     Q = m.shape[1]
     
     KL_X = T.sum(m*m)+T.sum(S-T.log(S)) - Q*N
     
     return 0.5*KL_X
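One plausible reading of KLD_X is the closed-form KL divergence between a diagonal Gaussian q = N(m, diag(S)) (with S holding variances) and a standard normal prior, summed over an N x Q matrix; a plain-NumPy check of that reading:

import numpy as np

rng = np.random.RandomState(0)
m = rng.randn(3, 2)             # means, N x Q
S = rng.rand(3, 2) + 0.1        # variances, N x Q
kl_elementwise = 0.5 * (m ** 2 + S - np.log(S) - 1.0)
kl_as_above = 0.5 * (np.sum(m * m) + np.sum(S - np.log(S)) - m.size)
assert np.allclose(np.sum(kl_elementwise), kl_as_above)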
Example #18
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
        super(WeightNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        self.init_stdv = init_stdv
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g", regularizable=False, trainable=train_g)
        if len(self.input_shape)==4:
            self.axes_to_sum = (0,2,3)
            self.dimshuffle_args = ['x',0,'x','x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x',0]

        # scale weights in layer below
        incoming.W_param = incoming.W
        #incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
        if incoming.W_param.ndim==4:
            if isinstance(incoming, Deconv2DLayer):
                W_axes_to_sum = (0,2,3)
                W_dimshuffle_args = ['x',0,'x','x']
            else:
                W_axes_to_sum = (1,2,3)
                W_dimshuffle_args = [0,'x','x','x']
        else:
            W_axes_to_sum = 0
            W_dimshuffle_args = ['x',0]
        if g is not None:
            incoming.W = incoming.W_param * (self.g/T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
        else:
            incoming.W = incoming.W_param / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum,keepdims=True))
Example #19
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=100,
                 layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024],
                 n_outs=2,
                 rho=0.9, eps=1.E-6,
                 L1_reg=0.,
                 L2_reg=0.,
                 debugprint=False, mu=0.9):
        """
        Feedforward neural network with added L1 and/or L2 regularization.
        """
        super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho, eps, debugprint, mu)

        L1 = shared(0.)
        for param in self.params:
            L1 += T.sum(abs(param))
        if L1_reg > 0.:
            self.cost = self.cost + L1_reg * L1
        L2 = shared(0.)
        for param in self.params:
            L2 += T.sum(param ** 2)
        if L2_reg > 0.:
            self.cost = self.cost + L2_reg * L2
Example #20
    def __init__(self, n_in, n_out, n_hidden, activation='tanh',
                 l1_reg=0.00, l2_reg=0.00):
        BasicRNN.__init__(self, n_in, n_out, n_hidden, activation)
        bh_init = np.zeros((n_hidden,), dtype=theano.config.floatX)
        by_init = np.zeros((n_out,), dtype=theano.config.floatX)
        self.bh = theano.shared(value=bh_init, name='bh')
        self.by = theano.shared(value=by_init, name='by')
        self.params = [self.U, self.W, self.V, self.bh, self.by]

        # for every parameter, we maintain its last update
        # the idea here is to use "momentum"
        # keep moving mostly in the same direction
        self.velocity_updates = {}
        for param in self.params:
            init = np.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX)
            self.velocity_updates[param] = theano.shared(init)

        self.L1_reg = float(l1_reg)
        self.L2_reg = float(l2_reg)
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = 0
        self.L1 += abs(self.W.sum())
        self.L1 += abs(self.U.sum())

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = 0
        self.L2_sqr += T.sum(self.W ** 2)
        self.L2_sqr += T.sum(self.U ** 2)
Example #21
    def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)
        
        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]
        
        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
Example #22
	def errors(self, y, print_output=False):
		# check if y has same dimension of y_pred
		if y.ndim != self.y_pred.ndim:
			raise TypeError('y should have the same shape as self.y_pred', ('y', y.type, 'y_pred', self.y_pred.type))

		# check if y is of the correct datatype
		if y.dtype.startswith('int'):
			num_positive = T.cast(T.sum(T.eq(y,1)),'float64')
			num_predicted_positive = T.cast(T.sum(T.eq(self.y_pred,1)),'float64')
			num_correctly_predicted = T.cast(T.sum(T.eq(self.y_pred*y,1)),'float64')

			# precision  = True positive / (True positive + False positive)
			# note: a Python `if` on a symbolic T.gt() is not evaluated on data,
			# so guard the divisions with T.switch instead
			P = T.switch(T.gt(num_predicted_positive, 0.0),
			             num_correctly_predicted / num_predicted_positive,
			             T.cast(0.0, 'float64'))

			# recall     = True positive / (True positive + False negative)
			R = T.switch(T.gt(num_positive, 0.0),
			             num_correctly_predicted / num_positive,
			             T.cast(0.0, 'float64'))

			# F1 score
			F1 = T.switch(T.gt(P + R, 0.0), 2.0*P*R/(P+R), T.cast(0.0, 'float64'))

			if (print_output):
				print("      num positive = {0}".format( num_positive ) )
				print("      num predicted positive = {0}".format( num_predicted_positive ) )
				print("      num correctly predicted = {0}".format( num_correctly_predicted ) )
				print("      precision = {0}".format(P))
				print("      recall = {0}".format(R))
				print("      F1 score = {0}".format(F1))
			return [T.mean(T.neq(self.y_pred, y)), P, R, F1]

		else:
			raise NotImplementedError()
		return
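A plain-NumPy sketch of the same counts-based precision / recall / F1, with hypothetical 0/1 label vectors:

import numpy as np

y_true = np.array([1, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 0, 1, 1, 1])
num_positive = float(np.sum(y_true == 1))
num_predicted_positive = float(np.sum(y_pred == 1))
num_correctly_predicted = float(np.sum((y_true * y_pred) == 1))
P = num_correctly_predicted / num_predicted_positive if num_predicted_positive > 0 else 0.0
R = num_correctly_predicted / num_positive if num_positive > 0 else 0.0
F1 = 2.0 * P * R / (P + R) if (P + R) > 0 else 0.0
print(P, R, F1)   # 0.75 0.75 0.75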
Example #23
def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = nn.layers.get_output(model.l_target)
    enable_targets = nn.layers.get_output(model.l_enable_target)
    

    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        ptype = property_type[obj_name]
        if ptype == 'classification':
            num_units = len(property_bin_borders[obj_name])
            v_obj = cce(obj_idx, (unit_ptr, unit_ptr+num_units), predictions, targets, epsilon)
            # take the mean of the objectives where it matters (enabled targets)
            obj_scalar =  T.sum(enable_targets[:,obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:,obj_idx]))
            unit_ptr = unit_ptr + num_units
        elif ptype == 'continuous':
            v_obj = sqe(obj_idx, unit_ptr, predictions, targets)
            obj_scalar =  T.mean(v_obj)
            unit_ptr += 1
        else:
            raise ValueError("unknown property type: %s" % ptype)
        
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar

        sum_of_objectives += norm_weights_loss[obj_name] * obj_scalar


    return sum_of_objectives
Example #24
 def applyConstraint(self, param):
     if param.ndim != 4 and param.ndim != 2:
         warnings.warn("Norm constraints are normally applied to matrices"
                       " or 4-dimensional tensors, but currently got "
                       "%d dimensions; please make sure this is the desired"
                       " parameter to apply norm constraints" % param.ndim)
         
     needFlip = False
     if param.ndim == 4: # a hack for conv layer filters
         prevShape = param.shape
         # conv layer filter shape is (nChannelOut, nChannelIn, r, c)
         param = param.flatten(2)
         # now it is (nout, nin), which is different from (nin, nout) 
         # from fully connected networks, so we need to flip here
         needFlip = True
     
     if needFlip:
         col_norm = T.sqrt(T.sum(T.sqr(param), axis=1, keepdims=True))
     else:
         col_norm = T.sqrt(T.sum(T.sqr(param), axis=0, keepdims=True))
         
     param /= (col_norm+1e-7)
     param *= self.norm
     
     if needFlip:
         param = param.reshape(prevShape)
                     
     return param
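A minimal NumPy sketch of the same column-norm constraint, assuming a fully connected weight matrix and self.norm = 1.0: every column is rescaled to the target L2 norm.

import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(4, 3)             # (nin, nout), as in a fully connected layer
norm = 1.0
col_norm = np.sqrt(np.sum(np.square(W), axis=0, keepdims=True))
W = W / (col_norm + 1e-7) * norm
print(np.sqrt(np.sum(W ** 2, axis=0)))   # ~[1. 1. 1.]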
Example #25
def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol, log_scale=True):
    """
    Based on code from Shawn Tan.
    Credits to Kyle Kastner as well.

    This function computes the CTC log likelihood for a sequence that has
    been augmented with blank labels.


    """
    y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype="int32")
    y_mask_len = tensor.sum(y_mask, axis=0, dtype="int32")

    if log_scale:
        log_probabs = _log_path_probabs(y, T.log(y_hat), y_mask, y_hat_mask, blank_symbol)
        batch_size = log_probabs.shape[1]

        # Add the probabilities of the final time steps to get the total
        # sequence likelihood.
        log_labels_probab = _log_add(
            log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1],
            log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2],
        )
    else:
        probabilities = _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol)
        batch_size = probabilities.shape[1]
        labels_probab = (
            probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1]
            + probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2]
        )
        log_labels_probab = tensor.log(labels_probab)
    return log_labels_probab
Example #26
    def getRpRnTpTnForTrain0OrVal1(self, y, training0OrValidation1):
        # The returned list has (numberOfClasses)x4 integers: >numberOfRealPositives, numberOfRealNegatives, numberOfTruePredictedPositives, numberOfTruePredictedNegatives< for each class (incl background).
        # Order in the list is the natural order of the classes (ie class-0 RP,RN,TPP,TPN, class-1 RP,RN,TPP,TPN, class-2 RP,RN,TPP,TPN ...)
        # param y: y = T.itensor4('y'). Dimensions [batchSize, r, c, z]
        
        yPredToUse = self.y_pred_train if  training0OrValidation1 == 0 else self.y_pred_val
        checkDimsOfYpredAndYEqual(y, yPredToUse, "training" if training0OrValidation1 == 0 else "validation")
        
        returnedListWithNumberOfRpRnTpTnForEachClass = []
        
        for class_i in xrange(0, self._numberOfOutputClasses) :
            #Number of Real Positive, Real Negatives, True Predicted Positives and True Predicted Negatives are reported PER CLASS (first for WHOLE).
            tensorOneAtRealPos = T.eq(y, class_i)
            tensorOneAtRealNeg = T.neq(y, class_i)

            tensorOneAtPredictedPos = T.eq(yPredToUse, class_i)
            tensorOneAtPredictedNeg = T.neq(yPredToUse, class_i)
            tensorOneAtTruePos = T.and_(tensorOneAtRealPos,tensorOneAtPredictedPos)
            tensorOneAtTrueNeg = T.and_(tensorOneAtRealNeg,tensorOneAtPredictedNeg)
                    
            returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtRealPos) )
            returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtRealNeg) )
            returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtTruePos) )
            returnedListWithNumberOfRpRnTpTnForEachClass.append( T.sum(tensorOneAtTrueNeg) )
            
        return returnedListWithNumberOfRpRnTpTnForEachClass
Example #27
def ThangAttentionUnit(attention_state_prev, current_stack_top, premise_stack_tops, projected_stack_tops, attention_dim,
                    vs, name="attention_unit", initializer=None):
    """
    Args:
      attention_state_prev: The output of this unit at the previous time step.
      current_stack_top: The current stack top (h state only, if applicable).
      premise_stack_tops: The values to do attention over.
      projected_stack_tops: Projected vectors to use to produce an attentive
          weighting alpha_t.
      attention_dim: The dimension of the vectors over which to do attention.
      vs: A variable store for the learned parameters.
      name: An identifier for the learned parameters in this unit.
      initializer: Used to initialize the learned parameters.

    Dimension notation:
      B : Batch size
      k : Model dim
      L : num_transitions
    """
    # Shape: B x L
    score = T.sum(projected_stack_tops * current_stack_top, axis=2).T
    alpha_t = T.nnet.softmax(score)

    # Shape B x k
    Y__alpha_t = T.sum(premise_stack_tops * alpha_t.T[:, :, np.newaxis], axis=0)

    mlstm_input = T.concatenate([Y__alpha_t, current_stack_top], axis=1)

    r_t = LSTMLayer(attention_state_prev, mlstm_input, 2 * attention_dim, 2 * attention_dim, vs, name="%s/lstm" % name)

    return r_t
Example #28
    def finetune_cost_updates(self, center, mu, learning_rate):
        """ This function computes the cost and the updates ."""

        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, withd one entry per
        #        example in minibatch
        network_output = self.get_output()
        temp = T.pow(center - network_output, 2)    
        
        L =  T.sum(temp, axis=1) 
        # Add the network reconstruction error 
        z = self.get_network_reconst()
        reconst_err = T.sum(T.pow(self.x - z, 2), axis = 1)            
        L = self.beta*L + self.lbd*reconst_err
        
        cost1 = T.mean(L)
        cost2 = self.lbd*T.mean(reconst_err)  
        cost3 = cost1 - cost2

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost1, self.params)  
        # generate the list of updates
        updates = []
        grad_values = []
        param_norm = []
        for param, delta, gparam in zip(self.params, self.delta, gparams):
            updates.append( (delta, mu*delta - learning_rate * gparam) )
            updates.append( (param, param + mu*mu*delta - (1+mu)*learning_rate*gparam ))
            grad_values.append(gparam.norm(L=2))
            param_norm.append(param.norm(L=2))
        
        grad_ = T.stack(*grad_values)
        param_ = T.stack(*param_norm)
        return ((cost1, cost2, cost3, grad_, param_), updates)
Example #29
    def get_cost_updates(self, contraction_level, learning_rate):
        """ This function computes the cost and the updates for one trainng
        step of the cA """

        y = self.get_hidden_values(self.x)
        z = self.get_reconstructed_input(y)
        J = self.get_jacobian(y, self.W)
        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, with one entry per
        #        example in minibatch
        self.L_rec = - T.sum(self.x * T.log(z) +
                             (1 - self.x) * T.log(1 - z),
                             axis=1)

        # Compute the jacobian and average over the number of samples/minibatch
        self.L_jacob = T.sum(J ** 2) // self.n_batchsize

        # note : L is now a vector, where each element is the
        #        cross-entropy cost of the reconstruction of the
        #        corresponding example of the minibatch. We need to
        #        compute the average of all these to get the cost of
        #        the minibatch
        cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob)

        # compute the gradients of the cost of the `cA` with respect
        # to its parameters
        gparams = T.grad(cost, self.params)
        # generate the list of updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - learning_rate * gparam))

        return (cost, updates)
Example #30
    def output_probabilistic(self, m_w_previous, v_w_previous):
        if (self.non_linear):
            m_in = self.m_w - m_w_previous
            v_in = self.v_w
            # We compute the mean and variance after the ReLU activation
            lam = self.lam
            v_1 = 1 + 2*lam*v_in
            v_1_inv = v_1**-1

            s_1 = T.prod(v_1,axis=1)**-0.5
            v_2 = 1 + 4*lam*v_in
            v_2_inv = v_2**-1
            s_2 = T.prod(v_2,axis=1)**-0.5
            v_inv = v_in**-1
            exponent1 = m_in**2*(1 - v_1_inv)*v_inv
            exponent1 = T.sum(exponent1,axis=1)
            exponent2 = m_in**2*(1 - v_2_inv)*v_inv
            exponent2 = T.sum(exponent2,axis=1)
            m_a = s_1*T.exp(-0.5*exponent1)
            v_a = s_2*T.exp(-0.5*exponent2) - m_a**2

            return (m_a, v_a)

        else:
            m_w_previous_with_bias = \
            T.concatenate([ m_w_previous, T.alloc(1, 1) ], 0)
            v_w_previous_with_bias = \
            T.concatenate([ v_w_previous, T.alloc(0, 1) ], 0)

            m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs)
            v_linear = (T.dot(self.v_w, v_w_previous_with_bias) + \
                T.dot(self.m_w**2, v_w_previous_with_bias) + \
                T.dot(self.v_w, m_w_previous_with_bias**2)) / self.n_inputs
            return (m_linear, v_linear)
Example #31
    def __theano_build__(self):
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x_a = T.ivector('x_a')
        x_b = T.ivector('x_b')
        y = T.lvector('y')

        def forward_step(x_t, s_t_prev):
            # Word embedding layer
            x_e = E[:, x_t]
            # GRU layer 1
            z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t_prev))
            r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t_prev))
            c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t))
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            # directly return the hidden state as intermediate output
            return [s_t]

        # sentence a vector (states)
        a_s, updates = theano.scan(forward_step,
                                   sequences=x_a,
                                   truncate_gradient=self.bptt_truncate,
                                   outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states)
        b_s, updates = theano.scan(forward_step,
                                   sequences=x_b,
                                   truncate_gradient=self.bptt_truncate,
                                   outputs_info=T.zeros(self.hidden_dim))

        # semantic similarity
        # s_sim = manhattan_distance(a_s[-1],b_s[-1])

        # for classification using simple strategy
        sena = a_s[-1]
        senb = b_s[-1]

        combined_s = T.concatenate([sena, senb], axis=0)

        # softmax class
        o = T.nnet.softmax(V.dot(combined_s) + c)[0]

        # in case o contains 0, which would cause inf in the cross-entropy
        eps = np.asarray([1.0e-10] * self.label_dim,
                         dtype=theano.config.floatX)
        o = o + eps
        om = o.reshape((1, o.shape[0]))
        prediction = T.argmax(om, axis=1)
        o_error = T.nnet.categorical_crossentropy(om, y)

        # cost
        cost = T.sum(o_error)

        # updates
        updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost)

        # monitor parameter
        mV = V * T.ones_like(V)
        mc = c * T.ones_like(c)
        mU = U * T.ones_like(U)
        mW = W * T.ones_like(W)

        gV = T.grad(cost, V)
        gc = T.grad(cost, c)
        gU = T.grad(cost, U)
        gW = T.grad(cost, W)

        mgV = gV * T.ones_like(gV)
        mgc = gc * T.ones_like(gc)
        mgU = gU * T.ones_like(gU)
        mgW = gW * T.ones_like(gW)

        # Assign functions
        self.monitor = theano.function([x_a, x_b],
                                       [sena, senb, mV, mc, mU, mW])
        self.monitor_grad = theano.function([x_a, x_b, y],
                                            [mgV, mgc, mgU, mgW])
        self.predict = theano.function([x_a, x_b], om)
        self.predict_class = theano.function([x_a, x_b], prediction)
        self.ce_error = theano.function([x_a, x_b, y], cost)
        # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        # find the nan
        self.sgd_step = theano.function(
            [x_a, x_b, y], [],
            updates=updates
            # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        )
Example #32
    def __init__(self, layer_def, inputs, inputs_shape, rs, clone_from=None):
        """
            Create a Dirichlet layer, according to the following paper:
                Malmir M, Sikka K, Forster D, Fasel I, Movellan JR, Cottrell GW. 
                Deep Active Object Recognition by Joint Label and Action Prediction. 
                arXiv preprint arXiv:1512.05484. 2015 Dec 17. 
            Each unit in this layer encodes a Dirichlet distribution over its input.
            The input is assumed to be a belief vector, i.e. \sum_i input[i] = 1, 0 <= input_i <= 1 for all i
                 
            :type layer_def: Element, xml containing config for this layer

            :type inputs: a list of [belief_in, actions, objects, previous_belief] 
            :param inputs[0], belief_in, is a theano.matrix which contains belief vectors in its columns
            :param inputs[1], actions, theano.ivector, list of actions for each column of belief_in 
            :param inputs[2], objects, theano.ivector, list of objects for each column of belief_in 
            :param inputs[3], previous_belief, theano.matrix, used to accumulate beliefs over time 
            
            
            :type inputs_shape: list of sizes of inputs 

            :type rs: a random number generator used to initialize weights
        """
        assert (
            len(inputs) == 4
        )  #belief dim x batch_sz, actions: 1 x batch_size, objects 1 x batch_sz, accbelief (numActs*numObjs) x batch_sz
        beliefs, actions, objects, accbeliefs = inputs
        self.inputs = inputs  # beliefs, actions, objects
        dim = inputs_shape[0][0]
        assert (inputs_shape[0][1] == inputs_shape[1][1])  #batch_size
        assert (inputs_shape[0][1] == inputs_shape[2][1])  #batch_size
        assert (inputs_shape[0][1] == inputs_shape[3][1])  #batch_size
        assert (inputs_shape[1][0] == 1)  #action is a single integer
        assert (inputs_shape[2][0] == 1)  #object label is a single integer
        batch_size = inputs_shape[0][1]
        self.numActions = int(layer_def.find("numActions").text)
        self.numObjects = int(layer_def.find("numObjects").text)
        assert (self.numObjects * self.numActions == inputs_shape[3][0])
        assert (self.numObjects == dim)
        #total number of dirichlet units = numActions x numObjects
        num_dirichlets = self.numObjects * self.numActions
        if clone_from is None:
            self.alphas = theano.shared(np.random.randint(
                5, 30, [dim, num_dirichlets]).astype(theano.config.floatX) /
                                        25.,
                                        borrow=True)  # dim x num_dirichlets
        else:
            self.alphas = clone_from.alphas
        #self.alphas         = theano.shared(0.7* np.ones([dim,num_dirichlets]).astype(theano.config.floatX),borrow=True)# dim x num_dirichlets
        #remove 0 from the input belief
        normalized_beliefs = beliefs + 1.e-6
        normalized_beliefs = normalized_beliefs / T.reshape(
            T.sum(normalized_beliefs, axis=0), [1, batch_size])
        log_normed_beliefs = T.log(normalized_beliefs)  # dim x batch_size
        self.log_normed = log_normed_beliefs

        #calculate Dirichlet probs for the current normalize beliefs
        self.term1 = T.reshape(T.gammaln(T.sum(self.alphas, axis=0)),
                               [num_dirichlets, 1])
        self.term2 = T.reshape(T.sum(T.gammaln(self.alphas), axis=0),
                               [num_dirichlets, 1])
        self.term3 = T.dot(T.transpose(self.alphas - 1.),
                           log_normed_beliefs)  # num_dirichlets x batch_size
        #find a mask based on the actions
        dirichlet_actions = np.tile(
            np.arange(self.numActions).reshape([-1, 1]), [self.numObjects, 1])
        dirichlet_actions = np.tile(dirichlet_actions, [1, batch_size])
        dirichlet_actions = theano.shared(dirichlet_actions.astype(
            theano.config.floatX),
                                          borrow=True)
        in_actions = T.tile(T.reshape(actions, [1, batch_size]),
                            [num_dirichlets, 1])
        self.eq_actions = T.eq(dirichlet_actions, in_actions)
        #self.current_belief = T.exp(self.term1 - self.term2 + self.eq_actions * self.term3) #this should be normalized for each column
        log_cur_belief = self.term1 - self.term2 + self.eq_actions * self.term3  #this should be normalized for each column
        #log_cur_belief      = self.term1 - self.term2 + self.term3 #this should be normalized for each column
        log_cur_belief_normd = log_cur_belief - T.reshape(
            T.max(log_cur_belief, axis=0), [1, batch_size])
        cur_blf = self.eq_actions * T.exp(log_cur_belief_normd)
        self.current_belief = cur_blf / T.sum(cur_blf, axis=0)

        acc_is_zero = T.eq(accbeliefs, 0.)
        accbeliefs_no_0 = acc_is_zero + (1. - acc_is_zero) * accbeliefs
        updated_belief = self.eq_actions * self.current_belief * accbeliefs_no_0 + (
            1. - self.eq_actions) * accbeliefs  # num_dirichlet x batch_size
        sum_up_blf = T.reshape(T.sum(updated_belief, axis=0), [1, batch_size])
        #sum_up_blf_normed   = T.switch( T.eq(sum_up_blf, 0.) , np.ones([1,batch_size]).astype(theano.config.floatX),sum_up_blf)
        #self.updated_belief = updated_belief / sum_up_blf_normed
        self.updated_belief = updated_belief / sum_up_blf
        self.output = self.updated_belief
        #self.updated_belief = self.current_belief

        #construct the outputs
        # for each class, assign 1s to the components that indicate P(a,o|x)
        #weights_marginalize = np.zeros([self.numObjects,num_dirichlets],dtype=theano.config.floatX)
        #for i in range(self.numObjects):
        #    weights_marginalize[i,i*self.numActions:(i+1)*self.numActions] = 1.
        #weights_margin      = theano.shared( weights_marginalize , borrow=True)
        #self.output         = T.dot( weights_margin, self.updated_belief)

        #calculating weight updates
        objects_idx = np.tile(
            np.arange(self.numObjects).reshape([-1, 1]),
            [1, self.numActions]).reshape([1, -1])
        objects_idx = np.tile(objects_idx.reshape([-1, 1]),
                              [1, batch_size])  # num_dirichlets x batch_size
        objects_idx = theano.shared(objects_idx.astype(theano.config.floatX),
                                    borrow=True)
        in_objects = T.tile(T.reshape(objects, [1, batch_size]),
                            [num_dirichlets, 1])  # num_dirichlets x batch_size
        self.idx = self.eq_actions * T.eq(
            objects_idx, in_objects)  # num_dirichlets x batch_size
        self.idx = self.idx.astype(theano.config.floatX)
        self.N = T.reshape(T.sum(self.idx, axis=1), [1, num_dirichlets])
        #take care of 0 in the input to avoid nan in log
        term5 = T.dot(log_normed_beliefs,
                      T.transpose(self.idx))  #dim x num_dirichlets
        self.update = self.N * T.reshape(T.psi(T.sum(self.alphas, axis=0)),
                                         [1, num_dirichlets]) - self.N * T.psi(
                                             self.alphas) + term5
        #self.update         = T.psi(self.alphas) + term5

        #calculate log-prob of data (each term below has shape: num_dirichlets)
        dir_l_p = self.N * T.gammaln(T.sum(
            self.alphas, axis=0)) - self.N * T.sum(
                T.gammaln(self.alphas), axis=0) + T.sum(
                    term5 * (self.alphas - 1.), axis=0)
        self.log_p_ao = T.mean(dir_l_p)

        self.params = [self.alphas]
        self.inputs_shape = inputs_shape
        #self.output_shape   = [dim,batch_size]
        self.output_shape = [num_dirichlets, batch_size]
Example #33
    def __init__(self, num_actions):

        # remember parameters
        self.num_actions = num_actions

        # batch size is T_MAX now
        self.batch_size = T_MAX  #BATCH_SIZE

        self.discount_rate = DISCOUNT_RATE

        self.history_length = HISTORY_LENGTH
        self.screen_dim = DIMS
        self.img_height = SCREEN_HEIGHT
        self.img_width = SCREEN_WIDTH

        self.beta = BETA

        self.learning_rate = LEARNING_RATE
        self.rms_decay = RMS_DECAY
        self.rms_epsilon = RMS_EPSILON

        # prepare tensors once and reuse them
        state = T.tensor4('state')
        reward = T.fvector('reward')
        advantage = T.fvector('advantage')
        action = T.ivector('action')

        #beta = T.fscalar('regularization_rate')
        # set learning rate
        #self.shared_beta = theano.shared(np.zeros((1)), dtype=theano.config.floatX ,
        #        broadcastable=(True))
        #self.shared_beta.set_value([BETA])

        # create shared theano variables
        self.state_shared = theano.shared(
            np.zeros((self.batch_size, self.history_length, self.img_height,
                      self.img_width),
                     dtype=theano.config.floatX))

        self.reward_shared = theano.shared(
            np.zeros((self.batch_size), dtype=theano.config.floatX))

        self.advantage_shared = theano.shared(
            np.zeros((self.batch_size), dtype=theano.config.floatX))

        self.action_shared = theano.shared(
            np.zeros((self.batch_size), dtype='int32'))

        # can add multiple nets here
        # Shared network parameters here
        self.shared_net = self.build_shared_network()
        shared_out = lasagne.layers.get_output(self.shared_net, state)

        ####### OPTIMIZATION here --------------
        # Policy network parameters here
        self.policy_network = self.build_policy_network()
        policy_out = lasagne.layers.get_output(self.policy_network, shared_out)

        # Value network parameters here
        self.value_network = self.build_value_network()
        value_out = lasagne.layers.get_output(self.value_network, shared_out)

        ## ----------------------- LOSS FUNCTION STARTS HERE ----------------------------------------

        N = state.shape[0]

        # take log policy loss
        policy_loss = -T.log(policy_out[
            T.arange(N), self.action_shared]) * self.advantage_shared

        # negative entropy of the policy, used as an exploration regularizer
        entropy = -T.sum(-policy_out * T.log(policy_out), axis=1)

        # add entropy regularization
        policy_loss += self.beta * entropy

        #policy_loss = T.sum(policy_loss)

        # get the value loss
        value_loss = (
            (self.reward_shared - T.reshape(value_out,
                                            (self.batch_size, )))**2) / 2
        #value_loss = T.sum(value_loss)

        total_loss = T.sum(policy_loss + (0.5 * value_loss))

        ## ----------------------- LOSS FUNCTION ENDS HERE ----------------------------------------

        shared_params = lasagne.layers.helper.get_all_params(self.shared_net)
        only_policy_params = lasagne.layers.helper.get_all_params(
            self.policy_network)
        only_value_params = lasagne.layers.helper.get_all_params(
            self.value_network)

        policy_params = shared_params + only_policy_params
        value_params = shared_params + only_value_params

        g_time = time.time()
        logger.info("graph compiling")

        # get grads here
        policy_grad = T.grad(total_loss, policy_params)
        value_grad = T.grad(total_loss, value_params)

        # there'll be two kinds of updates
        policy_updates = rmsprop_updates(policy_grad, policy_params,
                                         self.learning_rate, self.rms_decay,
                                         self.rms_epsilon)

        value_updates = rmsprop_updates(value_grad, value_params,
                                        self.learning_rate, self.rms_decay,
                                        self.rms_epsilon)

        givens = {
            state: self.state_shared,
            reward: self.reward_shared,
            action: self.action_shared,
            advantage: self.advantage_shared,
        }

        # theano functions for accumulating the grads
        self._policy_grad = theano.function([], policy_grad, givens=givens)
        self._value_grad = theano.function([], value_grad, givens=givens)

        # train will take input the grads and just apply them

        # NEEDS work here ------------
        self._train_policy = theano.function([], [],
                                             updates=policy_updates,
                                             givens=givens)
        self._train_value = theano.function([], [],
                                            updates=value_updates,
                                            givens=givens)

        # get output for a state
        self._policy = theano.function([],
                                       policy_out,
                                       givens={state: self.state_shared})
        self._value = theano.function([],
                                      value_out,
                                      givens={state: self.state_shared})

        # need more theano functions for getting policy and value
        logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
Example #34
    def get_detections(self, model, data_x, data_m, params):

        pr_threshold = params.get("prThreshold", 0.01)
        nms_threshold = params.get("nmsThreshold", 0.5)
        corner_threshold = params.get("cornerThreshold", self.sparse_layer.corner_threshold)
        corner_max = params.get("cornerMax", 1024)
        use_soft_nms = params.get("useSoftNMS", 0) == 1
        t = (pr_threshold, nms_threshold, corner_threshold, corner_max)
        logging.verbose("Using detection params - pr threshold: %f, nms threshold: %f, corner_threshold: %f, corner_max: %i"%t)

        first_detect = False
        if self.detect_func is None:

            #get all model outputs
            outputs=[]

            if self.use_jointfit:
                det_fit = self.det_pr
                det_fit_null = det_fit[:, self.null_class, :, :]
                det_fit = det_fit[:,:self.class_num*self.fitness_num, :, :]
                det_fit = det_fit.reshape((self.batch_size, self.class_num, self.fitness_num, self.sample_num, self.sample_num))
                det_fit_pr = tensor.exp(det_fit)
                
                m = tensor.max(det_fit, axis=2)
                det_pr = m + tensor.log(tensor.sum(tensor.exp(det_fit - m[:,:,None,:,:]), axis=2))
                det_pr = tensor.concatenate([det_pr, det_fit_null[:,None,:,:]], axis=1)
                outputs.append(det_pr)

                val = [self.overlap_threshold[0] + i*(1.0 - self.overlap_threshold[0])/self.fitness_num for i in range(self.fitness_num)]
                fitness_val = theano.shared(numpy.array(val, dtype=numpy.float32))
                fitness = tensor.log(tensor.sum(det_fit_pr*fitness_val[None,None,:,None,None], axis=2))
                outputs.append(fitness)
            else:
                outputs.append(self.det_pr)

            if self.use_bbox_reg:
                outputs.append(self.bbox_predict)

            if self.use_indfit:
                outputs.append(tensor.exp(self.indfit_pr))

            logging.info("Building detection function")
            self.detect_func = theano.function([model.input], outputs, givens=[(get_train(), tensor.cast(0, 'int8'))], on_unused_input='ignore')

            logging.verbose("Exporting graph...")
            with open("detect_graph.txt", "w") as f:
                theano.printing.debugprint(self.detect_func, file=f, print_type=True)
            
            first_detect = True

        #get sampling bounding boxs
        logging.verbose("Detecting sample bboxs (%.2f)"%corner_threshold)
        timer = common.Timer()
        sample_bboxs = self.sparse_layer.get_samples(data_x, train=False, store_shared=True)
        timer.mark()
        logging.verbose("Found sample bboxs: {}".format([len(bbox) for bbox in sample_bboxs]))

        #upload sampling bounding boxs
        bboxs = self.sparse_layer.set_samples(sample_bboxs)
        timer.mark()

        #classify sampling bounding boxs
        r = list(self.detect_func(data_x))

        #get outputs
        if self.use_jointfit:
            det_pr = r[0]
            fitness = r[1]
            r_index = 2
        else:
            det_pr = r[0]
            fitness = numpy.copy(det_pr)
            r_index = 1
            
        if self.use_bbox_reg:
            bboxs = r[r_index]
            r_index += 1
        else:
            bboxs = self.sparse_layer.get_bbox_array(sample_bboxs)

            
        if self.use_indfit:
            indfit_pr = r[r_index]
            fitness_val = numpy.array([0.0] + [self.overlap_threshold[0] + i * (1.0 - self.overlap_threshold[0])/(self.fitness_num-1) for i in range(self.fitness_num-1)])
            fitness_exp = numpy.sum(indfit_pr*fitness_val[None,:,None,None], axis=1).astype(numpy.float32)
            fitness += numpy.log(fitness_exp)[:,None,:,:]
            r_index += 1
                

        timer.mark()
        sample_bbox_num = [len(s) for s in sample_bboxs]
        detlists = c_code.build_detections_nms(pr_threshold, nms_threshold, use_soft_nms, det_pr, fitness, bboxs, sample_bbox_num)
        timer.mark()

        logging.verbose("Found detections:", [len(detlist) for detlist in detlists])
        logging.verbose("FPS=%.1f, Timing (ms) - get samples: %i, upload: %i, classify: %i, build+nms %i"%tuple([self.batch_size / timer.current()] + timer.deltas_ms()))

        if not first_detect:
            global detect_time, detect_num
            detect_time += timer.current()
            detect_num += self.batch_size
            logging.info("Average FPS=%.1f"%(detect_num / detect_time))

        #results format
        results=[]
        for i, detlist in enumerate(detlists):
            results.append({"detections":detlist, "meta":data_m[i]})

        return results
Example #35
0
    def get_errors(self, yt_index, yt_value):

        #unpack indices and values
        shapes = [self.det_shape]
        if self.use_bbox_reg:
            shapes += [(self.batch_size, self.sample_num, self.sample_num), (self.batch_size, 8, self.sample_num, self.sample_num)]
        if self.use_indfit:
            shapes += [self.indfit_shape]
            
        v = common.ndarray_unpack(yt_value, shapes)
        det_pr = v[0]
        index = 1
        if self.use_bbox_reg:
            bbox_valid, bbox_reg = v[index:(index+2)]
            index += 2
        if self.use_indfit:
            indfit_pr = v[index]

        #Detection Cost:
        det_errors = -tensor.sum(det_pr*self.det_pr, axis=1) / math.log(self.det_shape[1])

        #Bounding Box Regression Cost:
        bbox_errors = None
        if self.use_bbox_reg and self.bbox_factor > 0.0:
            bbox_target = bbox_reg[:,0:4,...]
            bbox_sample = bbox_reg[:,4:8,...]
            
            bbox_errors = tensor.zeros((self.batch_size, self.sample_num, self.sample_num), dtype=numpy.float32)
            if self.use_bounded_iou:
                target_x = bbox_target[:,0,:,:]
                target_y = bbox_target[:,1,:,:]
                target_w = bbox_target[:,2,:,:]
                target_h = bbox_target[:,3,:,:]
                predict_x = 0.5*(self.bbox_predict[:,:,:,0] + self.bbox_predict[:,:,:,2])
                predict_y = 0.5*(self.bbox_predict[:,:,:,1] + self.bbox_predict[:,:,:,3])
                predict_w = self.bbox_predict[:,:,:,2] - self.bbox_predict[:,:,:,0]
                predict_h = self.bbox_predict[:,:,:,3] - self.bbox_predict[:,:,:,1]
                
                dx = target_x - predict_x
                dy = target_y - predict_y
                eps = 0.001

                #NOTE: the original paper used 4*dx; the proper implementation uses 2*dx
                cost_x = tensor.switch(dx >= 0.0, 2*dx / (target_w + dx + eps), -2*dx / (target_w - dx + eps))
                cost_y = tensor.switch(dy >= 0.0, 2*dy / (target_h + dy + eps), -2*dy / (target_h - dy + eps))
                cost_w = 1.0 - tensor.minimum(target_w / (predict_w + eps), predict_w / (target_w + eps))
                cost_h = 1.0 - tensor.minimum(target_h / (predict_h + eps), predict_h / (target_h + eps))
                cost = tensor.concatenate([cost_x[:,None,:,:], cost_y[:,None,:,:], cost_w[:,None,:,:], cost_h[:,None,:,:]], axis=1)
                bbox_errors += self.bbox_factor*bbox_valid*tensor.sum(theano_util.smooth_L1(cost), axis=1)
            else:
                #standard Fast R-CNN style cost
                tx = (bbox_target[:, 0, ...] - bbox_sample[:, 0, ...]) / bbox_sample[:, 2, ...]
                ty = (bbox_target[:, 1, ...] - bbox_sample[:, 1, ...]) / bbox_sample[:, 3, ...]
                tw = tensor.log(bbox_target[:, 2, ...] / bbox_sample[:, 2, ...])
                th = tensor.log(bbox_target[:, 3, ...] / bbox_sample[:, 3, ...])
                t = tensor.concatenate([tx[:,None, ...], ty[:,None, ...], tw[:,None, ...], th[:,None, ...]], axis=1)
                dt = t - self.bbox_reg
                bbox_errors += self.bbox_factor*bbox_valid*tensor.sum(theano_util.smooth_L1(dt), axis=1)

        indfit_errors = None
        if self.use_indfit:
            indfit_errors = -tensor.sum(indfit_pr*self.indfit_pr, axis=1) / math.log(self.fitness_num)
                
        return det_errors, bbox_errors, indfit_errors
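
# A minimal, self-contained NumPy sketch (not part of the detector class above; all
# names here are illustrative) showing that the bounded-IoU switch() expression for
# cost_x collapses to the symmetric form 2*|dx| / (target_w + |dx| + eps); cost_y
# behaves the same way with dy and target_h.
import numpy as np

def cost_x_switch(dx, target_w, eps=0.001):
    # mirrors tensor.switch(dx >= 0, 2*dx/(target_w+dx+eps), -2*dx/(target_w-dx+eps))
    return np.where(dx >= 0.0,
                    2 * dx / (target_w + dx + eps),
                    -2 * dx / (target_w - dx + eps))

def cost_x_closed_form(dx, target_w, eps=0.001):
    # equivalent closed form: 2*|dx| / (target_w + |dx| + eps)
    return 2 * np.abs(dx) / (target_w + np.abs(dx) + eps)

dx = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
target_w = np.full_like(dx, 10.0)
print(np.allclose(cost_x_switch(dx, target_w), cost_x_closed_form(dx, target_w)))  # True
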
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        cqembed = tensor.concatenate([cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)], axis=2)
        clstms, chidden_list = make_bidir_lstm_stack(cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2*sum(config.ctx_lstm_size) #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP start
        attention_mlp_start = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp_start')
        attention_qlinear_start = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_start') #Wum
        attention_clinear_start = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_start') # Wym
        bricks += [attention_mlp_start, attention_qlinear_start, attention_clinear_start]
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(attention_clinear_start.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear_start.apply(qenc)[None, :, :])
        att_weights_start = attention_mlp_start.apply(layer1_start.reshape((layer1_start.shape[0]*layer1_start.shape[1], layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape((layer1_start.shape[0], layer1_start.shape[1]))
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        attended = tensor.sum(cenc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        attention_mlp_end = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp_end')
        attention_qlinear_end = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_end') #Wum
        attention_clinear_end = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_end') # Wym
        bricks += [attention_mlp_end, attention_qlinear_end, attention_clinear_end]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(attention_clinear_end.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(layer1_end.reshape((layer1_end.shape[0]*layer1_end.shape[1], layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape((layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        att_weights_start = tensor.dot(tensor.le(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)),
                                         tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(tensor.ge(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)),
                                       tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_end)

        # add attention from left and right
        #att_weights = att_weights_start * att_weights_end
        att_weights = tensor.minimum(att_weights_start, att_weights_end)

        att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
                               tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        self.predictions = tensor.gt(att_weights, 0.5) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_target.name = 'att_target'
        att_weights.name = 'att_weights'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
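
# A small standalone NumPy sketch (names are illustrative, not taken from the model
# above) of the triangular-matrix trick applied to att_weights_start / att_weights_end:
# multiplying by a lower-triangular 0/1 matrix turns the start weights into a prefix
# (cumulative) sum, the upper-triangular matrix turns the end weights into a suffix sum,
# and their element-wise minimum is largest for positions inside the predicted span.
import numpy as np

L = 6
w_start = np.array([0.0, 0.8, 0.1, 0.05, 0.05, 0.0])  # start mass near position 1
w_end   = np.array([0.0, 0.0, 0.1, 0.8,  0.1,  0.0])  # end mass near position 3

idx = np.arange(L)
lower = (idx[None, :] <= idx[:, None]).astype(float)  # lower[i, j] = 1 if j <= i
upper = (idx[None, :] >= idx[:, None]).astype(float)  # upper[i, j] = 1 if j >= i

from_start = np.dot(lower, w_start)     # prefix sum: weight that the span has started by i
from_end   = np.dot(upper, w_end)       # suffix sum: weight that the span has not ended before i
span_weight = np.minimum(from_start, from_end)
print(span_weight.round(2))             # peaks over positions 1..3
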
Example #37
0
def test_batch_normalization_train_broadcast():
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # convert axes to explicit list
            if axes == 'per-activation':
                axes2 = (0, )
            elif axes == 'spatial':
                axes2 = (0, ) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ['x'] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False, ) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ('scale',
                                                                'bias',
                                                                'running_mean',
                                                                'running_var'))

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # batch_normalization_train with broadcasted variables
            train_bc = \
                bn.batch_normalization_train(
                    x, scale_bc, bias_bc, axes, eps,
                    running_average_factor, running_mean_bc, running_var_bc)
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = \
                bn.batch_normalization_test(
                    x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = \
                bn.batch_normalization_test(
                    x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps)

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc, )
            results_bc = train_bc + (test_bc, )
            results = [
                abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)
            ]

            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)
            inputs = [
                np.asarray(np.random.rand(*((4, ) * n)), x.dtype) for n in [
                    x.ndim, scale.ndim, bias.ndim, running_mean.ndim,
                    running_var.ndim
                ]
            ]
            assert 0.0 == f(*inputs)
Example #38
0
def hybrid_loss(y, t):
    log_loss = categorical_crossentropy(y, t).mean()
    kappa_loss = quad_kappa_loss(y, t, y_pow=2)
    return kappa_loss + 0.5 * T.clip(log_loss, 0.6, 10**3)


def discrete_predict(predictions):
    return T.round(T.clip(predictions, 0, 4))


predictions = nn.layers.get_output(output_layer, deterministic=False)
train_log_loss, train_reg_loss, train_multi_loss = multi_task_loss(
    predictions, y)
params = nn.layers.get_all_params(output_layer, regularizable=True)
regularization = sum(T.sum(p**2) for p in params)
train_loss = train_multi_loss + l2_reg * regularization
train_accuracy = accuracy(predictions[:, :num_class], y)
train_kappa = quad_kappa(predictions[:, :num_class], y)

valid_predictions = nn.layers.get_output(output_layer, deterministic=True)
valid_log_loss, valid_reg_loss, valid_multi_loss = multi_task_loss(
    valid_predictions, y)
valid_accuracy = accuracy(valid_predictions[:, :num_class], y)
valid_kappa = quad_kappa(valid_predictions[:, :num_class], y)

# Scale grads
all_params = nn.layers.get_all_params(output_layer, trainable=True)
all_grads = T.grad(train_loss, all_params)
#scaled_grads = nn.updates.total_norm_constraint(all_grads, max_norm=5, return_norm=False)
def GESD(sum_uni_l, sum_uni_r):
    eucli=1/(1+T.sum((sum_uni_l-sum_uni_r)**2))
    kernel=1/(1+T.exp(-(T.dot(sum_uni_l,sum_uni_r.T)+1)))
    return (eucli*kernel).reshape((1,1))   
Example #40
0
    #theano#################################################################################################################
    #check forward
    fo = theano.function([X, W_x, W_h, B, hid], o)
    print"x=", v_x
    print"w_x=",v_w_x
    print"w_h=", v_w_h
    print"b=", v_b
    print"b_mkl=", v_b_mkl
    print"hid=",v_hid
    print"o_real=",v_o_real

    v_o = fo(v_x, v_w_x, v_w_h, v_b, v_hid)
    #print "forward o=",v_o

    #check gradients
    loss = -T.sum(o * T.log(o_real))
    gx = T.grad(loss, X)
    fx = theano.function([X, W_x, W_h, B, hid, o_real], gx)
    #theano.printing.pydotprint(fx, outfile='rnn_dx.png', var_with_name_simple=True)
    gradients_x = fx(v_x, v_w_x, v_w_h, v_b, v_hid, v_o_real)
    print "gradients_x=", gradients_x

    #mkl#################################################################################################################
    O = mkl_simplernn_bw_op.SimpleRNN_bw()(X, W_x, W_h, B_mkl, hid, o_real)
    fx_mkl = theano.function([X, W_x, W_h, B_mkl, hid, o_real], O)
    gradients_x_mkl = fx_mkl(v_x, v_w_x, v_w_h, v_b_mkl, v_hid, v_o_real)
    theano.printing.pydotprint(fx_mkl, outfile='rnn_dx.png', var_with_name_simple=True)
    
    print "gradients_x_mkl=", gradients_x_mkl
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3,
                    maxSentLength=30, emb_size=300, hidden_size=[300,10],
                    margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    vocab_size=len(word2id)+1
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2]
    indices_train_r=indices_train[1::2]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2]
    indices_test_r=indices_test[1::2]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    train_size = len(indices_train_l)
    test_size = len(indices_test_l)
    
    train_batch_start=range(train_size)
    test_batch_start=range(test_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int32')
#     indices_train_r=T.cast(indices_train_r, 'int32')
#     indices_test_l=T.cast(indices_test_l, 'int32')
#     indices_test_r=T.cast(indices_test_r, 'int32')
    


    rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng)
#     rand_values[0]=numpy.array(numpy.zeros(emb_size))
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True)      
    

    
    # allocate symbolic variables for the data
#     index = T.iscalar()
    x_index_l = T.imatrix()   # now, x is the index matrix, must be integer
    x_index_r = T.imatrix()
    y = T.ivector()  
    left_l=T.iscalar()
    right_l=T.iscalar()
    left_r=T.iscalar()
    right_r=T.iscalar()
    length_l=T.iscalar()
    length_r=T.iscalar()
    norm_length_l=T.fscalar()
    norm_length_r=T.fscalar()
    mts=T.fmatrix()
    wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # input "image" shape: (embedding dim, max sentence length)
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0]))
    layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0]))
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    
    
    
    sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    HL_layer_1_input=T.concatenate([
#                                 mts, 
                                eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                uni_cosine,
#                                 sum_uni_l,
#                                 sum_uni_r,
#                                 sum_uni_l+sum_uni_r,
                                1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)),
                                cosine(layer0_l_output_maxpool, layer0_r_output_maxpool),
                                layer0_l_output_maxpool,
                                layer0_r_output_maxpool,
                                T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10),
                                
                                layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                layer1.output_cosine,
                                layer1.output_vector_l,
                                layer1.output_vector_r,
                                T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10),
#                                 len_l, len_r
                                layer1.output_attentions
#                                 wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input,
                                mts, len_l, len_r
#                                 wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_size = 1+1 + 1+1+3*nkerns[0] + 1+1+3*nkerns[0] + 10*10
    
    HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh)
    
    LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1)
    LR_layer_input_with_extra=T.concatenate([HL_layer_2.output,  HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output,
    
    LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2)
#     LR_layer_input=HL_layer_2.output
#     LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2)

#     layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
#     diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix)
    cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg
    

    test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True)



    params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        clipped_grad = T.clip(grad_i, -0.5, 0.5)
        acc = acc_i + T.sqr(clipped_grad)
        updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True)

    train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant


    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    nn_max_acc=0.0
    best_iter=0
    cost_tmp=0.0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data

        for index in train_batch_start: 
            # iter counts how many minibatches have been run so far
            iter = (epoch - 1) * train_size + minibatch_index +1

            minibatch_index=minibatch_index+1

#             if iter%update_freq != 0:
#                 cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
#                 #print 'cost_ij: ', cost_ij
#                 cost_tmp+=cost_ij
#                 error_sum+=error_ij
#             else:

            cost_i, error_i= train_model(indices_train_l[index: index + batch_size],
                                                              indices_train_r[index: index + batch_size],
                                                              trainY[index: index + batch_size],
                                                              trainLeftPad_l[index],
                                                              trainRightPad_l[index],
                                                              trainLeftPad_r[index],
                                                              trainRightPad_r[index],
                                                              trainLengths_l[index],
                                                              trainLengths_r[index],
                                                              normalized_train_length_l[index],
                                                              normalized_train_length_r[index],
                                                              mt_train[index: index + batch_size],
                                                              wm_train[index: index + batch_size])
            cost_tmp+=cost_i
            if iter < 6000 and iter %100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
            if iter >= 6000 and iter % 100 == 0:
#             if iter%100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
                test_losses=[]
                test_y=[]
                test_features=[]
                for index in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size],
                                                                  indices_test_r[index: index + batch_size],
                                                                  testY[index: index + batch_size],
                                                                  testLeftPad_l[index],
                                                                  testRightPad_l[index],
                                                                  testLeftPad_r[index],
                                                                  testRightPad_r[index],
                                                                  testLengths_l[index],
                                                                  testLengths_r[index],
                                                                  normalized_test_length_l[index],
                                                                  normalized_test_length_r[index],
                                                                  mt_test[index: index + batch_size],
                                                                  wm_test[index: index + batch_size])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc = (1-test_score) * 100.
                if test_acc > nn_max_acc:
                    nn_max_acc = test_acc
                print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc

                #now, see the results of svm
                if use_svm:
                    train_y=[]
                    train_features=[]
                    for index in train_batch_start: 
                        cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size],
                                                                  indices_train_r[index: index + batch_size],
                                                                  trainY[index: index + batch_size],
                                                                  trainLeftPad_l[index],
                                                                  trainRightPad_l[index],
                                                                  trainLeftPad_r[index],
                                                                  trainRightPad_r[index],
                                                                  trainLengths_l[index],
                                                                  trainLengths_r[index],
                                                                  normalized_train_length_l[index],
                                                                  normalized_train_length_r[index],
                                                                  mt_train[index: index + batch_size],
                                                                  wm_train[index: index + batch_size])
                        train_y.append(y[0])
                        train_features.append(layer3_input[0])
                        #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                    #write_feature.close()
     
                    clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                    clf.fit(train_features, train_y)
                    results=clf.predict(test_features)
                    lr=LinearRegression().fit(train_features, train_y)
                    results_lr=lr.predict(test_features)
                    corr_count=0
                    corr_lr=0
                    test_size=len(test_y)
                    for i in range(test_size):
                        if results[i]==test_y[i]:
                            corr_count+=1
                        if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                            corr_lr+=1
                    acc=corr_count*1.0/test_size
                    acc_lr=corr_lr*1.0/test_size
                    if acc > max_acc:
                        max_acc=acc
                        best_iter=iter
                    if acc_lr> max_acc:
                        max_acc=acc_lr
                        best_iter=iter
                    print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at iter: ', best_iter

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #42
0
def log_softmax(x):
    x_diff = x - x.max(1, keepdims=True)
    return x_diff - T.log(T.sum(T.exp(x_diff), axis=1, keepdims=True))
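
# A quick standalone NumPy check (illustrative only) of the max-shift trick used above:
# subtracting the per-row maximum before exponentiating leaves log-softmax unchanged,
# because softmax is invariant to adding a constant, but it prevents exp() overflow
# for large logits.
import numpy as np

def log_softmax_np(x):
    x_diff = x - x.max(axis=1, keepdims=True)
    return x_diff - np.log(np.exp(x_diff).sum(axis=1, keepdims=True))

logits = np.array([[1000.0, 1001.0, 1002.0]])       # naive exp(logits) would overflow
print(log_softmax_np(logits))                        # finite: about [-2.41, -1.41, -0.41]
print(np.exp(log_softmax_np(logits)).sum(axis=1))    # probabilities still sum to 1
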
Example #43
0
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim, domaindim, feadim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)
        dnum = option['dnum']

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        decoderType = eval("Decoder{}".format(option["decoder"]))
        decoder = decoderType(tedim,
                              thdim,
                              ahdim,
                              2 * shdim,
                              dnum=dnum,
                              dim_maxout=maxdim,
                              max_part=maxpart,
                              dim_readout=deephid,
                              dim_domain=domaindim,
                              feadim=feadim,
                              n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            tag_seq = T.imatrix("domain_tag")
            # nsrc_mask = T.set_subtensor(src_mask[T.cast(T.sum(src_mask, 0) - 1, 'int32'),
            #                                      T.arange(src_mask.shape[1])], 0.0)

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding") as tgtembscope:
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                # target_bias = ops.get_variable("bias", [tedim])
                decoder.tiescope = tgtembscope

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

                dscores = nn.feedforward(dfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                dprobs = T.nnet.softmax(dscores)
                dpred_tag = T.argmax(dprobs, 1)
                didx = T.arange(tag_seq.flatten().shape[0])
                dce = -T.log(dprobs[didx, tag_seq.flatten()])
                dcost = T.mean(dce)

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            with ops.variable_scope("Shared"):
                sscores = nn.feedforward(sfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                sprobs = T.nnet.softmax(sscores)
                spred_tag = T.argmax(sprobs, 1)
                sidx = T.arange(tag_seq.flatten().shape[0])
                sce = -T.log(sprobs[sidx, tag_seq.flatten()])
                scost = T.mean(sce)
                adv_sce = -sprobs[sidx, tag_seq.flatten()] * T.log(
                    sprobs[sidx, tag_seq.flatten()])
                adv_scost = T.mean(adv_sce)

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            domain_annotation = nn.dropout(domain_annotation,
                                           keep_prob=keep_prob)
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate
            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            # batch * shdim
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] / 2:],
                domain_annotation[0, :, annotation.shape[-1] / 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

                _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                    annotation, initial_state, mapped_domain_keys,
                    domain_annotation, tag_seq, keep_prob)

        lamb = theano.shared(numpy.asarray(option["lambda"], dtype), "lambda")
        # cwscost *= lamb
        final_cost = cost + dcost + tgtdcost - lamb * adv_scost

        tag_inputs = [src_seq, src_mask]
        tag_outputs = [dpred_tag, spred_tag]
        tag_predict = theano.function(tag_inputs, tag_outputs)
        self.tag_predict = tag_predict

        tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        tgt_tag_outputs = [tpred_tag]
        tgt_tag_predict = theano.function(tgt_tag_inputs, tgt_tag_outputs)
        self.tgt_tag_predict = tgt_tag_predict

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq]
        training_outputs = [cost, dcost, adv_scost, tgtdcost]

        self.cost_cla = scost
        self.inputs_cla = [src_seq, src_mask, tag_seq]
        self.outputs_cla = [scost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate

            # decoder
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] / 2:],
                domain_annotation[0, :, annotation.shape[-1] / 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            # prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(prev_inputs, mask,
                                                   initial_state, mapped_keys,
                                                   annotation, src_mask,
                                                   mapped_domain_keys,
                                                   domain_annotation)
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state,
                                               context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state,
                                               context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            annotation, initial_state, mapped_keys, mapped_domain_keys,
            domain_annotation
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask,
                mapped_domain_keys, domain_annotation
            ]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        self.cost = final_cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        # self.align = align
        # self.sample = sample
        self.encode = encode
        # self.get_snt_cost = get_snt_cost
        self.option = option
def RBF(sum_uni_l, sum_uni_r):
    eucli=T.sum((sum_uni_l-sum_uni_r)**2)
    return T.exp(-0.5*eucli).reshape((1,1))    
Example #45
0
    def get_error(self, input):
        L = 0.5 * T.sum(T.sqr(self.a_test - input), axis=1)
        return T.mean(L)
Example #46
0
    def training(self, fea2obj, batch_size, learning_rate=0.005, steprule='adagrad', wait_epochs=5, kl_weight_init=None, klw_ep=50, klw_inc_rate=0, num_epochs=None):
        networkfile = self._config['net']
        
        n_epochs = num_epochs or int(self._config['nepochs'])
        reg_weight=float(self._config['loss_weight'])
        reg_type=self._config['loss_reg']
        numtrain = int(self._config['num_train']) if 'num_train' in self._config else None
        train_stream, num_samples_train = get_comb_stream(fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
        dev_stream, num_samples_dev = get_comb_stream(fea2obj, 'dev', batch_size=None, shuffle=False)
        logger.info('sources: %s -- number of train/dev samples: %d/%d', train_stream.sources, num_samples_train, num_samples_dev)
        
        t2idx = fea2obj['targets'].t2idx
        klw_init = kl_weight_init or (float(self._config['kld_weight']) if 'kld_weight' in self._config else 1)
        logger.info('kl_weight_init: %f', klw_init)
        kl_weight = shared_floatx(klw_init, 'kl_weight')
        entropy_weight = shared_floatx(1., 'entropy_weight')
        
        cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate= build_model_new(fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)
        
        cg = ComputationGraph(cost)
        
        weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
        logger.info('Model weights are: %s', weights)
        if 'L2' in reg_type:
            cost += reg_weight * l2_norm(weights)
            logger.info('applying %s with weight: %f ', reg_type, reg_weight)
          
        dropout = -0.1
        if dropout > 0:
            cg = apply_dropout(cg, weights, dropout)
            cost = cg.outputs[0]      
        
        cost.name = 'cost'
        logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule, learning_rate)
        if 'adagrad' in steprule:
            cnf_step_rule = AdaGrad(learning_rate)
        elif 'adadelta' in steprule:
            cnf_step_rule = AdaDelta(decay_rate=0.95)
        elif 'decay' in steprule:
            cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90)
            cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
        elif 'momentum' in steprule:
            cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
        elif 'adam' in steprule:
            cnf_step_rule = Adam(learning_rate=learning_rate)
        else:
            logger.info('The steprule param is wrong! which is: %s', steprule)
        
        algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=cnf_step_rule, on_unused_sources='warn')
        #algorithm.add_updates(updates)
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight, pat1_recog]
        train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True,
                                               before_first_epoch=True, prefix='tra')
        
        dev_monitor = DataStreamMonitoring(variables=[cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate], after_epoch=True,
                                           before_first_epoch=True, data_stream=dev_stream, prefix="dev")
        
        extensions = [dev_monitor, train_monitor, Timing(), 
                      TrackTheBest('dev_cost'),
                      FinishIfNoImprovementAfter('dev_cost_best_so_far', epochs=wait_epochs),
                      Printing(after_batch=False), #, ProgressBar()
                      FinishAfter(after_n_epochs=n_epochs),
                      saveload.Load(networkfile+'.toload.pkl'),
                      ] + track_best('dev_cost', networkfile+ '.best.pkl')
                      
        #extensions.append(SharedVariableModifier(kl_weight,
        #                                          lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
#         extensions.append(SharedVariableModifier(entropy_weight,
#                                                   lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
        
        logger.info('number of parameters in the model: %d', tensor.sum([p.size for p in cg.parameters]).eval())
        logger.info('Lookup table sizes: %s', [p.size.eval() for p in cg.parameters if 'lt' in p.name])
        
        main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                             model=Model(cost), extensions=extensions)
        main_loop.run()
 def Hx_plain():
     Hx_plain_splits = TT.grad(TT.sum(
         [TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
                               wrt=params,
                               disconnected_inputs='warn')
     return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])
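Hx_plain uses the standard Hessian-vector-product trick: differentiating the inner product of a fixed gradient with a direction vector yields H.x without ever forming H. A self-contained sketch of the same idea on a toy quadratic (the names w, x and the objective are illustrative, not from the surrounding optimizer):

import numpy as np
import theano
import theano.tensor as TT

w = theano.shared(np.array([1.0, 2.0]), name='w')
x = TT.dvector('x')                       # direction vector
loss = TT.sum(w ** 2) + TT.prod(w)        # toy objective; its Hessian is [[2, 1], [1, 2]]
g = TT.grad(loss, w)                      # gradient w.r.t. the parameters
Hx = TT.grad(TT.sum(g * x), w)            # Hessian-vector product via double backprop
f = theano.function([x], Hx)
print(f(np.array([1.0, 0.0])))            # ~[2., 1.], the first column of the Hessian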
def log_softmax(x):
    xdev = x - x.max(1, keepdims=True)
    return xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True))
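This is the standard max-shift for numerical stability: subtracting the row maximum before exponentiating keeps the log-sum-exp finite. A small hedged check against a naive NumPy formulation (the example logits are made up):

import numpy as np
import theano
import theano.tensor as T

x = T.dmatrix('x')
f = theano.function([x], log_softmax(x))

logits = np.array([[1000.0, 2.0, -5.0]])
print(f(logits))  # finite log-probabilities, roughly [0, -998, -1005]
# a naive NumPy version overflows on the same input:
print(np.log(np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)))  # nan / -inf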
Example #49
0
def find_threshold(images):
    result, updates = th.scan(fn=lambda i: find_val(i),
                              outputs_info=None,
                              sequences=images)
    return T.sum(result) / images.shape[0]
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed so the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)
    '''
    combine train and dev
    '''
    train_sents = np.concatenate([train_sents, dev_sents], axis=0)
    train_masks = np.concatenate([train_masks, dev_masks], axis=0)
    train_labels = np.concatenate([train_labels, dev_labels], axis=0)
    train_size = train_size + dev_size

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]

    # NN_para = multiCNN_para+ACNN_para

    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out features at UNK/pad positions
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out to zero out features at UNK/pad positions
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer: maps a feature vector of size LR_input_size to 12 type scores
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0] + emb_size
    #classification layer: maps a feature vector of size LR_att_input_size to 12 type scores
    U_att_a = create_ensemble_para(
        rng, 12, LR_att_input_size)  # the weight matrix hidden_size*2
    LR_att_b = theano.shared(value=np.zeros((12, ),
                                            dtype=theano.config.floatX),
                             name='LR_b',
                             borrow=True)  #bias for each target class
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=12,
        W=U_att_a,
        b=LR_att_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 12
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)

    att_loss = -T.mean(T.log(att_prob_pos))
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    acnn_LR_input = T.concatenate(
        [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer: maps a feature vector of size acnn_LR_input_size to 12 type scores
    acnn_U_a = create_ensemble_para(
        rng, 12, acnn_LR_input_size)  # the weight matrix hidden_size*2
    acnn_LR_b = theano.shared(value=np.zeros((12, ),
                                             dtype=theano.config.floatX),
                              name='LR_b',
                              borrow=True)  #bias for each target class
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)

    acnn_loss = -T.mean(T.log(acnn_prob_pos))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para  # put all model parameters together
    cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() +
                                                 (conv_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = T.max(T.concatenate([
        att_score_matrix.dimshuffle('x', 0, 1),
        score_matrix.dimshuffle('x', 0, 1),
        acnn_score_matrix.dimshuffle('x', 0, 1)
    ],
                                             axis=0),
                               axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = 0.6 * ensemble_NN_scores + 0.4 * 0.5 * (
        cosine_score_matrix + top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        binarize_prob,
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], label_sent,
                                  label_mask)

            #after every 20 batches, test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #51
0
def main():
    sym_y = T.imatrix('target_output')
    sym_mask = T.matrix('mask')
    sym_x = T.tensor3()

    TOL = 1e-5
    num_epochs = config.epochs
    batch_size = config.batch_size

    #### DATA ####
    #    print "@@@@TESTING@@@@"
    #    l_in = nn.layers.InputLayer(shape=(None, 700, 42))
    #    l_dim_a = nn.layers.DimshuffleLayer(
    #        l_in, (0,2,1))
    #    l_conv_a = nn.layers.Conv1DLayer(
    #        incoming=l_dim_a, num_filters=42, border_mode='same',
    #        filter_size=3, stride=1, nonlinearity=nn.nonlinearities.rectify)
    #    l_dim_b = nn.layers.DimshuffleLayer(
    #        l_conv_a, (0,2,1))
    #    out = nn.layers.get_output(l_dim_b, sym_x)
    #    testvar = np.ones((128, 700, 42)).astype('float32')
    #    print "@@@@EVAL@@@@"
    #    john = out.eval({sym_x: testvar})
    #    print("Johns shape")
    #    print(john.shape)


    print("Building network ...")
    ##########################DEBUG##########################
    l_in, l_out = config.build_model()

    ##########################DEBUG##########################
    all_layers = nn.layers.get_all_layers(l_out)
    num_params = nn.layers.count_params(l_out)
    print("  number of parameters: %d" % num_params)
    print("  layer output shapes:")
    for layer in all_layers:
        name = string.ljust(layer.__class__.__name__, 32)
        print("    %s %s" % (name, nn.layers.get_output_shape(layer)))
    print("Creating cost function")
    # lasagne.layers.get_output produces a variable for the output of the net
    out_train = nn.layers.get_output(
        l_out, sym_x, deterministic=False)

    #    testvar = np.ones((128, 700, 42)).astype('float32')
    #    john = out_train.eval({sym_x: testvar})
    #    print("@@@@@JOHN@@@@@")
    #    print(john.shape)
    #    print(john.reshape((-1, num_classes)).shape)

    print("Creating eval function")
    out_eval = nn.layers.get_output(
        l_out, sym_x, deterministic=True)

    probs_flat = out_train.reshape((-1, num_classes))

    lambda_reg = config.lambda_reg
    params = nn.layers.get_all_params(l_out, regularizable=True)
    reg_term = sum(T.sum(p ** 2) for p in params)
    cost = T.nnet.categorical_crossentropy(T.clip(probs_flat, TOL, 1 - TOL), sym_y.flatten())
    cost = T.sum(cost * sym_mask.flatten()) / T.sum(sym_mask) + lambda_reg * reg_term

    # Retrieve all parameters from the network
    all_params = nn.layers.get_all_params(l_out, trainable=True)
    # Setting the weights
    if hasattr(config, 'set_weights'):
        nn.layers.set_all_param_values(l_out, config.set_weights())
    # Compute SGD updates for training
    print("Computing updates ...")
    if hasattr(config, 'learning_rate_schedule'):
        learning_rate_schedule = config.learning_rate_schedule  # Import learning rate schedule
    else:
        learning_rate_schedule = {0: config.learning_rate}
    learning_rate = theano.shared(np.float32(learning_rate_schedule[0]))

    all_grads = T.grad(cost, all_params)

    cut_norm = config.cut_grad
    updates, norm_calc = nn.updates.total_norm_constraint(all_grads, max_norm=cut_norm, return_norm=True)

    if optimizer == "rmsprop":
        updates = nn.updates.rmsprop(updates, all_params, learning_rate)
    elif optimizer == "adadelta":
        updates = nn.updates.adadelta(updates, all_params, learning_rate)
    elif optimizer == "adagrad":
        updates = nn.updates.adagrad(updates, all_params, learning_rate)
    elif optimizer == "nag":
        momentum_schedule = config.momentum_schedule
        momentum = theano.shared(np.float32(momentum_schedule[0]))
        updates = nn.updates.nesterov_momentum(updates, all_params, learning_rate, momentum)
    else:
        sys.exit("please choose either <rmsprop/adagrad/adadelta/nag> in configfile")

    # Theano functions for training and computing cost
    print("config.batch_size %d" % batch_size)
    print("data.num_classes %d" % num_classes)
    if hasattr(config, 'build_model'):
        print("has build model")
    print("Compiling train ...")
    # Use this for training (see deterministic = False above)
    train = theano.function(
        [sym_x, sym_y, sym_mask], [cost, out_train, norm_calc], updates=updates)

    print("Compiling eval ...")
    # use this for eval (deterministic = True + no updates)
    eval = theano.function([sym_x, sym_y, sym_mask], [cost, out_eval])

    # Start timers
    start_time = time.time()
    prev_time = start_time

    all_losses_train = []
    all_accuracy_train = []
    all_losses_eval_train = []
    all_losses_eval_valid = []
    all_losses_eval_test = []
    all_accuracy_eval_train = []
    all_accuracy_eval_valid = []
    all_accuracy_eval_test = []
    all_mean_norm = []

    import data
    X_train, X_valid, y_train, y_valid, mask_train, mask_valid, num_seq_train \
        = data.get_train()
    print("y shape")
    print(y_valid.shape)
    print("X shape")
    print(X_valid.shape)
    # Start training

    for epoch in range(num_epochs):

        if (epoch % 10) == 0:
            print("Epoch %d of %d" % (epoch + 1, num_epochs))

        if epoch in learning_rate_schedule:
            lr = np.float32(learning_rate_schedule[epoch])
            print("  setting learning rate to %.7f" % lr)
            learning_rate.set_value(lr)
        if optimizer == "nag":
            if epoch in momentum_schedule:
                mu = np.float32(momentum_schedule[epoch])
                print("  setting learning rate to %.7f" % mu)
                momentum.set_value(mu)
        print("Shuffling data")
        seq_names = np.arange(0, num_seq_train)
        np.random.shuffle(seq_names)
        X_train = X_train[seq_names]
        y_train = y_train[seq_names]
        mask_train = mask_train[seq_names]

        num_batches = num_seq_train // batch_size
        losses = []
        preds = []
        norms = []
        for i in range(num_batches):
            idx = range(i * batch_size, (i + 1) * batch_size)
            x_batch = X_train[idx]
            y_batch = y_train[idx]
            mask_batch = mask_train[idx]
            loss, out, batch_norm = train(x_batch, y_batch, mask_batch)
            print(batch_norm)
            norms.append(batch_norm)
            preds.append(out)
            losses.append(loss)

        # if ((i+1) % config.write_every_batch == 0) | (i == 0):
        #                if i == 0:
        #                    start_place = 0
        #                else:
        #                    start_place = i-config.write_every_batch
        #                print "Batch %d of %d" % (i + 1, num_batches)
        #                print "  curbatch training loss: %.5f" % np.mean(losses[start_place:(i+1)])
        #                print "  curbatch training acc: %.5f" % np.mean(accuracy[start_place:(i+1)])
        predictions = np.concatenate(preds, axis=0)
        loss_train = np.mean(losses)
        all_losses_train.append(loss_train)

        acc_train = utils.proteins_acc(predictions, y_train[0:num_batches * batch_size],
                                       mask_train[0:num_batches * batch_size])
        all_accuracy_train.append(acc_train)

        mean_norm = np.mean(norms)
        all_mean_norm.append(mean_norm)

        if 1 == 1:
            print("  average training loss: %.5f" % loss_train)
            print("  average training accuracy: %.5f" % acc_train)
            print("  average norm: %.5f" % mean_norm)

            sets = [  # ('train', X_train, y_train, mask_train, all_losses_eval_train, all_accuracy_eval_train),
                ('valid', X_valid, y_valid, mask_valid, all_losses_eval_valid, all_accuracy_eval_valid)]
            for subset, X, y, mask, all_losses, all_accuracy in sets:
                print("  validating: %s loss" % subset)
                preds = []
                num_batches = np.size(X, axis=0) // config.batch_size
                for i in range(num_batches):  ## +1 to get the "rest"
                    print(i)
                    idx = range(i * batch_size, (i + 1) * batch_size)
                    x_batch = X[idx]
                    y_batch = y[idx]
                    mask_batch = mask[idx]
                    loss, out = eval(x_batch, y_batch, mask_batch)
                    preds.append(out)
                    #                    acc = utils.proteins_acc(out, y_batch, mask_batch)
                    losses.append(loss)
                # accuracy.append(acc)
                predictions = np.concatenate(preds, axis=0)
                print("  pred")
                print(predictions.shape)
                print(predictions.dtype)
                loss_eval = np.mean(losses)
                all_losses.append(loss_eval)

                #                acc_eval = np.mean(accuracy)
                acc_eval = utils.proteins_acc(predictions, y, mask)
                all_accuracy.append(acc_eval)

                print("  average evaluation loss (%s): %.5f" % (subset, loss_eval))
                print("  average evaluation accuracy (%s): %.5f" % (subset, acc_eval))

        now = time.time()
        time_since_start = now - start_time
        time_since_prev = now - prev_time
        prev_time = now
        est_time_left = time_since_start * num_epochs
        eta = datetime.now() + timedelta(seconds=est_time_left)
        eta_str = eta.strftime("%c")
        print("  %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev))
        print("  estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str))
        print()

        if (epoch >= config.start_saving_at) and ((epoch % config.save_every) == 0):
            print("  saving parameters and metadata")
            with open((metadata_path + "-%d" % (epoch) + ".pkl"), 'wb') as f:
                pickle.dump({
                    'config_name': config_name,
                    'param_values': nn.layers.get_all_param_values(l_out),
                    'losses_train': all_losses_train,
                    'accuracy_train': all_accuracy_train,
                    'losses_eval_train': all_losses_eval_train,
                    'losses_eval_valid': all_losses_eval_valid,
                    'losses_eval_test': all_losses_eval_test,
                    'accuracy_eval_valid': all_accuracy_eval_valid,
                    'accuracy_eval_train': all_accuracy_eval_train,
                    'accuracy_eval_test': all_accuracy_eval_test,
                    'mean_norm': all_mean_norm,
                    'time_since_start': time_since_start,
                    'i': i,
                }, f, pickle.HIGHEST_PROTOCOL)

            print("  stored in %s" % metadata_path)

    print()
Example #52
0
def check(X, i, j, a, b):
    k = T.sum(X[:, i:i + a, j:j + b])
    return k
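check simply sums an a-by-b window starting at (i, j) of every 2-D slice in the batch. A hedged sketch compiling it with symbolic indices and comparing against a direct NumPy slice (the tensor shape and index values are illustrative):

import numpy as np
import theano
import theano.tensor as T

X = T.dtensor3('X')
i, j, a, b = T.iscalars('i', 'j', 'a', 'b')
window_sum = theano.function([X, i, j, a, b], check(X, i, j, a, b))

data = np.arange(24, dtype='float64').reshape(2, 3, 4)
print(window_sum(data, 1, 1, 2, 2))       # symbolic window sum over both slices
print(data[:, 1:3, 1:3].sum())            # NumPy reference; same value (108.0)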
    def import_F(self, train_set_x, train_weight, train_label, batch_size):

        index = T.iscalar()

        print('import_Modeling...')

        self.f = {
            n: theano.function(
                [],
                f,
                name=n,
                givens={
                    self.X: train_set_x,
                    self.Xlabel: train_label,
                    self.Weight: train_weight
                },
                on_unused_input='ignore')
            for n, f in zip(['U', 'KL_U', 'KL_X'],
                            [self.U, self.KL_U, self.KL_X])
        }

        self.f['LL'] = theano.function(
            [index],
            outputs=self.LL,
            givens={
                self.X:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.Xlabel:
                train_label[index * batch_size:(index + 1) * batch_size],
                self.Weight:
                train_weight
            },
            on_unused_input='ignore')

        z = 0.0 * sum([T.sum(v) for v in self.params])
        self.g = {
            vn: {
                gn: theano.function(
                    [],
                    T.grad(gv + z, vv),
                    name='d' + gn + '_d' + vn,
                    givens={
                        self.X: train_set_x,
                        self.Xlabel: train_label,
                        self.Weight: train_weight
                    },
                    on_unused_input='ignore')
                for gn, gv in zip(['KL_U', 'KL_X'], [self.KL_U, self.KL_X])
            }
            for vn, vv in self.wrt.items()
        }

        for vn, vv in self.wrt.items():
            self.g[vn]['LL'] = theano.function(
                [index],
                T.grad(self.LL + z, vv),
                name='dLL' + '_d' + vn,
                givens={
                    self.X:
                    train_set_x[index * batch_size:(index + 1) * batch_size],
                    self.Xlabel:
                    train_label[index * batch_size:(index + 1) * batch_size],
                    self.Weight:
                    train_weight
                },
                on_unused_input='ignore')
Example #54
0
    def __init__(self, We_initial, params):
        initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
        We = theano.shared(np.asarray(We_initial, dtype=config.floatX))

        #symbolic params
        g1batchindices = T.imatrix()
        g2batchindices = T.imatrix()
        p1batchindices = T.imatrix()
        p2batchindices = T.imatrix()

        #get embeddings
        l_in = lasagne.layers.InputLayer((None, None, 1))
        l_emb = lasagne.layers.EmbeddingLayer(
            l_in,
            input_size=We.get_value().shape[0],
            output_size=We.get_value().shape[1],
            W=We)
        l_average = lasagne_average_layer([l_emb])

        embg1 = lasagne.layers.get_output(l_average, {l_in: g1batchindices})
        embg2 = lasagne.layers.get_output(l_average, {l_in: g2batchindices})
        embp1 = lasagne.layers.get_output(l_average, {l_in: p1batchindices})
        embp2 = lasagne.layers.get_output(l_average, {l_in: p2batchindices})

        #objective function
        g1g2 = (embg1 * embg2).sum(axis=1)
        g1g2norm = T.sqrt(T.sum(embg1**2, axis=1)) * T.sqrt(
            T.sum(embg2**2, axis=1))
        g1g2 = g1g2 / g1g2norm

        p1g1 = (embp1 * embg1).sum(axis=1)
        p1g1norm = T.sqrt(T.sum(embp1**2, axis=1)) * T.sqrt(
            T.sum(embg1**2, axis=1))
        p1g1 = p1g1 / p1g1norm

        p2g2 = (embp2 * embg2).sum(axis=1)
        p2g2norm = T.sqrt(T.sum(embp2**2, axis=1)) * T.sqrt(
            T.sum(embg2**2, axis=1))
        p2g2 = p2g2 / p2g2norm

        costp1g1 = params.margin - g1g2 + p1g1
        costp1g1 = costp1g1 * (costp1g1 > 0)

        costp2g2 = params.margin - g1g2 + p2g2
        costp2g2 = costp2g2 * (costp2g2 > 0)

        cost = costp1g1 + costp2g2

        self.all_params = lasagne.layers.get_all_params(l_average,
                                                        trainable=True)
        self.network_params = lasagne.layers.get_all_params(l_average,
                                                            trainable=True)
        self.network_params.pop(0)

        word_reg = 0.5 * params.LW * lasagne.regularization.l2(We - initial_We)
        cost = T.mean(cost) + word_reg

        #feedforward
        self.feedforward_function = theano.function([g1batchindices], embg1)
        self.cost_function = theano.function(
            [g1batchindices, g2batchindices, p1batchindices, p2batchindices],
            cost)

        prediction = g1g2

        self.scoring_function = theano.function(
            [g1batchindices, g2batchindices], prediction)

        #updates
        if params.updatewords:
            grads = theano.gradient.grad(cost, self.all_params)
            if params.clip:
                grads = [
                    lasagne.updates.norm_constraint(grad, params.clip,
                                                    range(grad.ndim))
                    for grad in grads
                ]
            updates = params.learner(grads, self.all_params, params.eta)
        else:
            grads = theano.gradient.grad(cost, self.network_params)
            if params.clip:
                grads = [
                    lasagne.updates.norm_constraint(grad, params.clip,
                                                    range(grad.ndim))
                    for grad in grads
                ]
            updates = params.learner(grads, self.network_params, params.eta)

        self.train_function = theano.function(
            [g1batchindices, g2batchindices, p1batchindices, p2batchindices],
            cost,
            updates=updates)
 def log_mvn(self, y, mean, beta):  # diagonal noise; y is N x D data; returns its Gaussian log-likelihood
     N = y.shape[0]
     D = y.shape[1]
     return -0.5 * D * T.sum(T.log(2 * np.pi *
                                   (1 / T.diag(beta)))) - 0.5 * T.sum(
                                       T.dot(beta, (y - mean)**2))
 def LINnn(self, sl2, X):
     return sl2 * (T.sum(X**2, 1) + 1) + eps
Example #57
0
File: test.py Project: dfm/rvhmc
import numpy as np

import theano
import theano.tensor as tt
from theano.tests import unittest_tools as utt

from kepler_op import KeplerOp

kepler = KeplerOp()

M = tt.dmatrix()
e = tt.dmatrix()
E = kepler(M, e)

f = theano.function([M, e], E)
g = theano.function([M, e], theano.grad(tt.sum(E), [M, e]))

np.random.seed(42)
N = (10, 2)
pt = [np.random.uniform(5, 10, N), np.random.rand(*N)]

print(f(*pt))
utt.verify_grad(kepler, pt)
    def compile_F(self, train_set_x, train_weight, train_label, batch_size):

        index = T.iscalar()

        print('Modeling...')

        model_file_name = 'model2' + '.save'
        # if a previously saved model exists, load it
        try:
            print('Trying to load model...')
            with open(model_file_name, 'rb') as file_handle:
                obj = pickle.load(file_handle)
                self.f, self.g = obj
                print('Loaded!')
            return
        except:
            print('Failed. Creating a new model...')

        self.f = {
            n: theano.function(
                [],
                f,
                name=n,
                givens={
                    self.X: train_set_x,
                    self.Xlabel: train_label,
                    self.Weight: train_weight
                },
                on_unused_input='ignore')
            for n, f in zip(['U', 'KL_U', 'KL_X'],
                            [self.U, self.KL_U, self.KL_X])
        }

        self.f['LL'] = theano.function(
            [index],
            outputs=self.LL,
            givens={
                self.X:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.Xlabel:
                train_label[index * batch_size:(index + 1) * batch_size],
                self.Weight:
                train_weight
            },
            on_unused_input='ignore')

        z = 0.0 * sum([T.sum(v) for v in self.params])
        self.g = {
            vn: {
                gn: theano.function(
                    [],
                    T.grad(gv + z, vv),
                    name='d' + gn + '_d' + vn,
                    givens={
                        self.X: train_set_x,
                        self.Xlabel: train_label,
                        self.Weight: train_weight
                    },
                    on_unused_input='ignore')
                for gn, gv in zip(['KL_U', 'KL_X'], [self.KL_U, self.KL_X])
            }
            for vn, vv in self.wrt.items()
        }

        for vn, vv in self.wrt.items():
            self.g[vn]['LL'] = theano.function(
                [index],
                T.grad(self.LL + z, vv),
                name='dLL' + '_d' + vn,
                givens={
                    self.X:
                    train_set_x[index * batch_size:(index + 1) * batch_size],
                    self.Xlabel:
                    train_label[index * batch_size:(index + 1) * batch_size],
                    self.Weight:
                    train_weight
                },
                on_unused_input='ignore')

        with open(model_file_name, 'wb') as file_handle:
            print('Saving model...')
            sys.setrecursionlimit(100000)
            pickle.dump([self.f, self.g],
                        file_handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
Example #59
0
def cmmd(dataset='mnist.pkl.gz',batch_size=500, layer_num = 2, hidden_dim = 20,seed = 0,layer_size=[500,200,100]):

	validation_frequency = 1
	test_frequency = 1
	pre_train = 0
	pre_train_epoch = 30

	print "Loading data ......."
	datasets = datapy.load_data_gpu_60000(dataset, have_matrix = True)
	train_set_x, train_set_y, train_y_matrix = datasets[0]
	valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
	test_set_x, test_set_y, test_y_matrix = datasets[2]

	n_train_batches = train_set_x.get_value().shape[0] / batch_size
	n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
	n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

	rng = np.random.RandomState(seed)                                                          
	rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

	################################
	##        build model         ##
	################################
	print "Building model ......."

	index = T.lscalar()
	x = T.matrix('x')  ##### batch_size * 28^2
	y = T.vector('y') 
	y_matrix = T.matrix('y_matrix') 
	random_z = T.matrix('random_z') ### batch_size * hidden_dim
	Inv_K_d = T.matrix('Inv_K_d')

	layers = []
	layer_output= []

	activation = nonlinearity.relu
	#activation = Tnn.sigmoid
	#### first layer
	layers.append(FullyConnected.FullyConnected(
			rng = rng,
			n_in = 28*28 + hidden_dim, 
			#n_in = 28*28, 
			n_out = layer_size[0],
			activation = activation
	))
	layer_output.append(layers[-1].output_mix(input=[x,random_z]))
	#layer_output.append(layers[-1].output(input=x))

	#### middle layer
	for i in range(layer_num):
		layers.append(FullyConnected.FullyConnected(
			rng = rng,
			n_in = layer_size[i], 
			n_out = layer_size[i+1],
			activation = activation
		))
		layer_output.append(layers[-1].output(input= layer_output[-1]))

	#### last layer
	activation = Tnn.sigmoid
	layers.append(FullyConnected.FullyConnected(
		rng = rng,
		n_in = layer_size[-1],
		n_out = 10,
		activation = activation
	))
	y_gen = layers[-1].output(input = layer_output[-1])
	
	lambda1_ = 1e-3
	lambda_= theano.shared(np.asarray(lambda1_, dtype=np.float32))


	K_d = kernel_gram_for_x(x,x,batch_size,28*28)
	K_s = K_d 
	K_sd = K_d
	#Inv_K_d = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d))
	Inv_K_s = Inv_K_d

	L_d = kernel_gram(y_matrix,y_matrix,batch_size,10)
	L_s = kernel_gram(y_gen,y_gen,batch_size,10)
	L_ds = kernel_gram(y_matrix,y_gen,batch_size,10)
	
	cost = -(NL.trace(K_d * Inv_K_d * L_d * Inv_K_d) +\
			NL.trace(K_s * Inv_K_s * L_s * Inv_K_s)- \
			NL.trace(K_sd * Inv_K_d * L_ds * Inv_K_s))
	cost_pre = -T.sum(T.sqr(y_matrix - y_gen))

	cc = T.argmax(y_gen,axis=1)
	correct = T.sum(T.eq(T.cast(T.argmax(y_gen,axis=1),'int32'),T.cast(y,'int32')))

	################################
	##        updates             ##
	################################
	params = []
	for aLayer in layers:
		params += aLayer.params
	gparams = [T.grad(cost,param) for param in params]
	gparams_pre = [T.grad(cost_pre,param) for param in params]

	learning_rate = 3e-4
	weight_decay=1.0/n_train_batches
	epsilon=1e-8

	l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
	get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r,
		decay1=0.1, decay2=0.001, weight_decay=weight_decay, epsilon=epsilon)
	updates = get_optimizer(params,gparams)

	updates_pre = get_optimizer(params,gparams_pre)


	################################
	##         pretrain model     ##
	################################
	parameters = theano.function(
			inputs = [],
			outputs = params,
			)

	'''
	pre_train_model = theano.function(
		inputs = [index,random_z],
		outputs = [cost_pre, correct],
		updates=updates_pre,
		givens={
			x:train_set_x[index * batch_size:(index + 1) * batch_size],
			y:train_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:train_y_matrix[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
		)
	cur_epoch = 0
	if pre_train == 1:
		for cur_epoch in range(pre_train_epoch):
			print 'cur_epoch: ', cur_epoch,
			cor = 0 
			for minibatch_index in range(n_train_batches):
				cost_pre_mini,correct_pre_mini = pre_train_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
				cor = cor + correct_pre_mini
			print 'correct number: ' , cor
		#np.savez(,model = model)
		'''

	if pre_train == 1:
		print "pre-training model....."
		pre_train = np.load('model.npz')['model']
		for (para, pre) in zip(params, pre_train):
			para.set_value(pre)

	################################
	##         prepare data       ##
	################################

	#### compute matrix inverse
	print "Preparing data ...."
	Invv = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d))
	prepare_data = theano.function(
			inputs = [index],
			outputs = [Invv,K_d],
			givens = {
				x:train_set_x[index * batch_size:(index + 1) * batch_size],
				}
			)

	Inv_K_d_l, K_d_l =  prepare_data(0)

	for minibatch_index in range(1, n_train_batches):
		if minibatch_index % 10 == 0:
			print 'minibatch_index:', minibatch_index
		Inv_pre_mini, K_d_pre_mini = prepare_data(minibatch_index)
		Inv_K_d_l = np.vstack((Inv_K_d_l,Inv_pre_mini))
		K_d_l = np.vstack((K_d_l,K_d_pre_mini))

	Inv_K_d_g = theano.shared(Inv_K_d_l,borrow=True)
	K_d_g = theano.shared(K_d_l, borrow=True)


	################################
	##         train model        ##
	################################

	train_model = theano.function(
		inputs = [index,random_z],
		outputs = [correct,cost,y,cc,y_gen],
		updates=updates,
		givens={
			x:train_set_x[index * batch_size:(index + 1) * batch_size],
			y:train_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:train_y_matrix[index * batch_size:(index + 1) * batch_size],
			#K_d:K_d_g[index * batch_size:(index + 1) * batch_size],
			Inv_K_d:Inv_K_d_g[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
	)

	valid_model = theano.function(
		inputs = [index,random_z],
		outputs = correct,
		#updates=updates,
		givens={
			x:valid_set_x[index * batch_size:(index + 1) * batch_size],
			y:valid_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:valid_y_matrix[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
	)

	test_model = theano.function(
		inputs = [index,random_z],
		outputs = [correct,y_gen],
		#updates=updates,
		givens={
			x:test_set_x[index * batch_size:(index + 1) * batch_size],
			y:test_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:test_y_matrix[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
	)

	n_epochs = 500
	cur_epoch = 0



	print "Training model ......"

	while (cur_epoch < n_epochs) :
		cur_epoch = cur_epoch + 1
		cor = 0
		for minibatch_index in xrange(n_train_batches):
			print minibatch_index,
			print " : ",
			correct,cost,a,b,y_gen = train_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
			cor = cor + correct
			print correct
			print b
			print y_gen
		with open('log.txt','a') as f:
			print >>f , "epoch: " , cur_epoch, "training_correct: " , cor

		if cur_epoch % validation_frequency == 0:
			cor2 = 0
			for minibatch_index in xrange(n_valid_batches):
				correct = valid_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
				cor2 = cor2 + correct
			with open('log.txt','a') as f:
				print >>f , "	validation_correct: " , cor2

		if cur_epoch % test_frequency == 0:
			cor2 = 0
			for minibatch_index in xrange(n_test_batches):
				correct,y_gen = test_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
				with open('log.txt','a') as f:
					for index in range(batch_size):
						if not np.argmax(y_gen[index]) == test_set_y[minibatch_index * batch_size + index]:
							print >>f , "index: " , minibatch_index * batch_size + index, 'true Y: ', test_set_y[minibatch_index * batch_size + index]
							print >>f , 'gen_y: ' , y_gen[index]

				cor2 = cor2 + correct
			with open('log.txt','a') as f:
				print >>f , "	test_correct: " , cor2
		
		if cur_epoch % 1 == 0:
			model = parameters()
			for i in range(len(model)):
				model[i] = np.asarray(model[i]).astype(np.float32)
			np.savez('model-' + str(cur_epoch), model=model)
    def __init__(self, D, M, Q, Domain_number, m, pre_params, Hiddenlayerdim1,
                 Hiddenlayerdim2):

        self.Xlabel = T.matrix('Xlabel')

        self.X = T.matrix('X')
        N = self.X.shape[0]

        self.Weight = T.matrix('Weight')

        ker = kernel(Q)
        mmd = MMD(M, Domain_number)

        mu_value = np.random.randn(M, D)
        Sigma_b_value = np.zeros((M, M)) + np.log(0.01)

        Z_value = m[:M]
        self.test = Z_value
        ls_value = np.zeros(Domain_number) + np.log(0.1)

        self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
        self.Sigma_b = theano.shared(value=Sigma_b_value,
                                     name='Sigma_b',
                                     borrow=True)
        self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
        self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

        self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

        self.hiddenLayer_x = HiddenLayer(rng=rng,
                                         input=self.X,
                                         n_in=D,
                                         n_out=Hiddenlayerdim1,
                                         activation=T.nnet.relu,
                                         number='_x')
        self.hiddenLayer_hidden = HiddenLayer(rng=rng,
                                              input=self.hiddenLayer_x.output,
                                              n_in=Hiddenlayerdim1,
                                              n_out=Hiddenlayerdim2,
                                              activation=T.nnet.relu,
                                              number='_h')
        self.hiddenLayer_m = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=Q,
                                         activation=T.nnet.relu,
                                         number='_m')
        self.hiddenLayer_S = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=Q,
                                         activation=T.nnet.relu,
                                         number='_S')

        self.loc_params = []
        self.loc_params.extend(self.hiddenLayer_x.params)
        self.loc_params.extend(self.hiddenLayer_hidden.params)
        self.loc_params.extend(self.hiddenLayer_m.params)
        self.loc_params.extend(self.hiddenLayer_S.params)

        self.local_params = {}
        for i in self.loc_params:
            self.local_params[str(i)] = i

        self.params.extend(ker.params)
        self.params.extend(mmd.params)

        self.hyp_params = {}
        for i in [self.mu, self.Sigma_b, self.ls]:
            self.hyp_params[str(i)] = i

        self.Z_params = {}
        for i in [self.Z]:
            self.Z_params[str(i)] = i

        self.global_params = {}
        for i in self.params:
            self.global_params[str(i)] = i

        self.params.extend(self.hiddenLayer_x.params)
        self.params.extend(self.hiddenLayer_hidden.params)
        self.params.extend(self.hiddenLayer_m.params)
        self.params.extend(self.hiddenLayer_S.params)

        self.wrt = {}
        for i in self.params:
            self.wrt[str(i)] = i

        for i, j in pre_params.items():
            self.wrt[i].set_value(j)

        m = self.hiddenLayer_m.output
        S_0 = self.hiddenLayer_S.output
        S_1 = T.exp(S_0)
        S = T.sqrt(S_1)

        from theano.tensor.shared_randomstreams import RandomStreams
        srng = RandomStreams(seed=234)
        eps_NQ = srng.normal((N, Q))
        eps_M = srng.normal((M, D))  # the mean and the variance need different random draws, so they are generated separately

        beta = T.exp(self.ls)
        # the covariance of u is not diagonal, so build a triangular factor (Cholesky-style) from Sigma_b

        Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) +
                       T.diag(T.exp(T.diag(self.Sigma_b))))

        # scale the mean and covariance by the kernel signal variance sf2
        mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

        Xtilda = m + S * eps_NQ
        self.U = mu_scaled + Sigma_scaled.dot(eps_M)

        Kmm = ker.RBF(self.Z)
        Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
        KmmInv = sT.matrix_inverse(Kmm)

        Kmn = ker.RBF(self.Z, Xtilda)
        Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

        Knn = ker.RBF(Xtilda)
        Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

        Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

        Kinterval = T.dot(KmmInv, Kmn)

        mean_U = T.dot(Kinterval.T, self.U)
        betaI = T.diag(T.dot(self.Xlabel, beta))
        Covariance = betaI

        self.LL = (self.log_mvn(self.X, mean_U, Covariance) -
                   0.5 * T.sum(T.dot(betaI, Ktilda)))
        self.KL_X = -self.KLD_X(m, S)
        self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)