Example no. 1
 def gru_cell(state_size):
     _cell = tf.nn.rnn_cell.GRUCell(
         state_size,
         activation=get_activation_by_name(
             self.args.activation))
     # _cell = tf.nn.rnn_cell.DropoutWrapper(_cell, output_keep_prob=0.5)
     return _cell
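
The factory above returns a single GRU cell. As a minimal TF 1.x sketch of how such a factory is typically composed (stacking with MultiRNNCell and unrolling with dynamic_rnn), the sizes, placeholder names, and the tf.tanh activation standing in for the project-specific get_activation_by_name helper below are illustrative assumptions, not part of the original code:

import tensorflow as tf

def gru_cell(state_size):
    # tf.tanh stands in for get_activation_by_name(self.args.activation)
    return tf.nn.rnn_cell.GRUCell(state_size, activation=tf.tanh)

state_size, num_layers, n_features = 128, 2, 50                  # illustrative sizes
inputs = tf.placeholder(tf.float32, [None, None, n_features])    # batch * time * features
stacked = tf.nn.rnn_cell.MultiRNNCell([gru_cell(state_size) for _ in range(num_layers)])
outputs, final_state = tf.nn.dynamic_rnn(stacked, inputs, dtype=tf.float32)
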
Example no. 2
 def lstm_cell(state_size):
     _cell = tf.nn.rnn_cell.LSTMCell(
         state_size,
         state_is_tuple=True,
         activation=get_activation_by_name(
             self.args.activation))
     # _cell = tf.nn.rnn_cell.DropoutWrapper(_cell, output_keep_prob=0.5)
     return _cell
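
If the commented-out DropoutWrapper line is enabled, one hedged way to make the keep probability switchable between training and testing in TF 1.x is a placeholder with a default value; keep_prob and the 1.0 default below are illustrative, not from the original code:

import tensorflow as tf

keep_prob = tf.placeholder_with_default(1.0, shape=[])   # feed e.g. 0.5 during training

def lstm_cell(state_size):
    cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True, activation=tf.tanh)
    # only output dropout here; DropoutWrapper also supports input_keep_prob
    return tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
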
Example no. 3
    def ready(self, args, train):
        # len * batch
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = T.matrix(dtype=theano.config.floatX)

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        self.dropout = theano.shared(dropout_prob)

        self.n_d = args["hidden_dim"]

        embedding_layer = EmbeddingLayer(n_d=self.n_d,
                                         vocab=set(w for w in train))
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

        activation = get_activation_by_name(args["activation"])

        rnn_layer = LSTM(n_in=self.n_d, n_out=self.n_d, activation=activation)

        output_layer = Layer(
            n_in=self.n_d,
            n_out=self.n_V,
            activation=T.nnet.softmax,
        )

        # (len*batch) * n_d
        x_flat = embedding_layer.forward(self.idxs.ravel())

        # len * batch * n_d
        x = apply_dropout(x_flat, self.dropout)
        x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

        # len * batch * (n_d+n_d)
        h = rnn_layer.forward_all(x, self.init_state, return_c=True)

        self.last_state = h[-1]
        h = h[:, :, self.n_d:]
        h = apply_dropout(h, self.dropout)

        self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))

        idys = self.idys.ravel()
        self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
        #self.nll = T.nnet.categorical_crossentropy(
        #                self.p_y_given_x,
        #                idys
        #            )

        self.layers = [embedding_layer, rnn_layer, output_layer]
        #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
        self.params = embedding_layer.params + rnn_layer.params + output_layer.params
        self.num_params = sum(
            len(x.get_value(borrow=True).ravel()) for l in self.layers
            for x in l.params)
        say("# of params in total: {}\n".format(self.num_params))
Example no. 4
    def ready(self):
        args = self.args
        #n_domain = 2
        accum_dict = self.accum_dict = {}
        
        # len(sent) * len(doc) * batch
        s_idxs = self.s_idxs = T.itensor3()
        t_idxs = self.t_idxs = T.itensor3()

        # batch
        s_idys = self.s_idys = T.ivector()
        t_idys = self.t_idys = T.ivector()
        
        # batch
        s_dom_ids = self.s_dom_ids = T.ivector()
        t_dom_ids = self.t_dom_ids = T.ivector()
        
        # len(doc) * batch, 0: negative, 1: positive, -1: REL_UNK, -2: REL_PAD
        s_gold_rels = self.s_gold_rels = T.imatrix() 
        t_gold_rels = self.t_gold_rels = T.imatrix() 
        
        # has label flag, 0: no, 1: yes
        s_has_lab = self.s_has_lab = T.iscalar()
        t_has_lab = self.t_has_lab = T.iscalar()
        
        self.dropout = theano.shared(np.float64(args.dropout).astype(
                            theano.config.floatX))

        embedding_layer = self.embedding_layer
        if not embedding_layer.fix_init_embs:
            accum_dict[embedding_layer] = self.create_accumulators(embedding_layer)

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d
        n_c = self.nclasses
        self.rho = theano.shared(np.float64(0.0).astype(theano.config.floatX))

        self.source_k = 2

        # CNN to encode sentence into embedding
        cnn_layer = self.cnn_layer = LeCNN(
                n_in = n_e,
                n_out = n_d,
                activation=activation,
                order = args.cnn_window_size,
                BN = True,
            )
        accum_dict[cnn_layer] = self.create_accumulators(cnn_layer)
        
        # softmax layer to predict the label of the document
        self.lab_hid_layer = lab_hid_layer = Layer(
                n_in = n_d,
                n_out = n_d,
                activation = activation,
            )
        accum_dict[lab_hid_layer] = self.create_accumulators(lab_hid_layer)
        self.lab_out_layer = lab_out_layer = Layer(
                n_in = n_d,
                n_out = n_c,
                activation = logsoftmax,
            )
        accum_dict[lab_out_layer] = self.create_accumulators(lab_out_layer)
        
        # hidden layer to predict the domain of the document
        dom_hid_layer = self.dom_hid_layer = Layer(
                n_in = n_d,
                n_out = n_d,
                activation = activation,
            )
        accum_dict[dom_hid_layer] = self.create_accumulators(dom_hid_layer)

        # softmax layer to predict the domain of the document
        dom_out_layer = self.dom_out_layer = Layer(
                n_in = n_d,
                n_out = 2,
                activation = logsoftmax,
            )
        accum_dict[dom_out_layer] = self.create_accumulators(dom_out_layer)

        # for each domain, a vector parameter to compute the relevance score
        rel_hid_layer = self.rel_hid_layer =  Layer(
                n_in = n_d,
                n_out = n_d,
                activation = activation,
            )
        accum_dict[rel_hid_layer] = self.create_accumulators(rel_hid_layer)
        s_rel_out_layer = self.s_rel_out_layer =  Layer(
                n_in = n_d,
                n_out = 1,
                activation = sigmoid,
            )
        accum_dict[s_rel_out_layer] = self.create_accumulators(s_rel_out_layer)
        t_rel_out_layer = self.t_rel_out_layer =  Layer(
                n_in = n_d,
                n_out = 1,
                activation = sigmoid,
            )
        accum_dict[t_rel_out_layer] = self.create_accumulators(t_rel_out_layer)
        
        # transformation to domain independent layer
        trans_layer = self.trans_layer = Layer(
                n_in = n_d,
                n_out = n_d,
                activation = activation,
                has_bias=False,
                init_zero=True,
            )
        accum_dict[trans_layer] = self.create_accumulators(trans_layer)
        val = np.eye(n_d, dtype=theano.config.floatX)
        identity_mat = theano.shared(val)
        trans_layer.W.set_value(val)
        
        # reconstruction layer
        recon_layer = self.recon_layer = Layer(
                n_in = n_d,
                n_out = n_e,
                activation = tanh,
            )
        accum_dict[recon_layer] = self.create_accumulators(recon_layer)
        
        # construct network
        s_lab_loss, s_rel_loss, s_dom_loss, s_adv_loss, s_lab_prob, s_recon_loss = self.ready_one_domain(
                         s_idxs, s_idys, s_dom_ids, s_gold_rels, \
                         cnn_layer, rel_hid_layer, s_rel_out_layer, trans_layer, \
                         dom_hid_layer, dom_out_layer, lab_hid_layer, lab_out_layer)
        self.s_lab_loss, self.s_rel_loss, self.s_dom_loss, self.s_adv_loss, self.s_lab_prob, self.s_recon_loss = \
                        s_lab_loss, s_rel_loss, s_dom_loss, s_adv_loss, s_lab_prob, s_recon_loss
        
        t_lab_loss, t_rel_loss, t_dom_loss, t_adv_loss, t_lab_prob, t_recon_loss = self.ready_one_domain(
                         t_idxs, t_idys, t_dom_ids, t_gold_rels, \
                         cnn_layer, rel_hid_layer, t_rel_out_layer, trans_layer, \
                         dom_hid_layer, dom_out_layer, lab_hid_layer, lab_out_layer)
        self.t_lab_loss, self.t_rel_loss, self.t_dom_loss, self.t_adv_loss, self.t_lab_prob, self.t_recon_loss = \
                        t_lab_loss, t_rel_loss, t_dom_loss, t_adv_loss, t_lab_prob, t_recon_loss
        
        # transformation regularization
        trans_reg = self.trans_reg = args.trans_reg * T.sum((trans_layer.W - identity_mat) ** 2)
        
        # domain cost
        layers = [ dom_out_layer, dom_hid_layer ]
        self.dom_params = self.get_params(layers)
        self.dom_accums = self.get_accumulators(layers, accum_dict)
        self.dom_cost = s_dom_loss + t_dom_loss + args.l2_reg * self.get_l2_cost(self.dom_params)
        
        # label cost
        lab_layers = [ lab_out_layer, lab_hid_layer ]
        lab_params = self.get_params(lab_layers)
        lab_cost = s_has_lab * self.source_k * s_lab_loss + t_has_lab * t_lab_loss \
                    + args.l2_reg * (s_has_lab + t_has_lab) * self.get_l2_cost(lab_params)
            
        # total cost
        other_layers = [ cnn_layer, s_rel_out_layer, t_rel_out_layer, rel_hid_layer, trans_layer, recon_layer ]
        other_params = self.get_params(other_layers)
        self.other_cost_except_dom = lab_cost + s_rel_loss + t_rel_loss + s_adv_loss + t_adv_loss + trans_reg \
                     + s_recon_loss + t_recon_loss \
                     + args.l2_reg * self.get_l2_cost(other_params)
        self.other_params_except_dom = lab_params + other_params
        self.other_accums_except_dom = self.get_accumulators(lab_layers + other_layers, accum_dict)
        if not embedding_layer.fix_init_embs:
            self.other_params_except_dom += embedding_layer.params
            self.add_accumulators(self.other_accums_except_dom, embedding_layer, accum_dict)
        
        # info
        layers = lab_layers + other_layers + [ dom_out_layer, dom_hid_layer ]
        params = self.params = self.get_params(layers)
        if not embedding_layer.fix_init_embs:
            self.params += embedding_layer.params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)
        ))
Example no. 5
    def ready(self, args, train):
        # len * batch
        depth = args["depth"]
        self.args = args
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = [
            T.matrix(dtype=theano.config.floatX) for i in xrange(depth * 2)
        ]

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        self.dropout = theano.shared(dropout_prob)
        rnn_dropout_prob = np.float64(args["rnn_dropout"]).astype(
            theano.config.floatX)
        self.rnn_dropout = theano.shared(rnn_dropout_prob)

        self.n_d = args["hidden_dim"]

        embedding_layer = EmbeddingLayer(n_d=self.n_d,
                                         vocab=set(w for w in train))
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

        activation = get_activation_by_name(args["activation"])

        layers = self.layers = []
        for i in xrange(depth):
            rnn_layer = KernelNN(n_in=self.n_d,
                                 n_out=self.n_d,
                                 activation=activation,
                                 highway=args["highway"],
                                 dropout=self.rnn_dropout)
            layers.append(rnn_layer)

        output_layer = Layer(
            n_in=self.n_d,
            n_out=self.n_V,
            activation=T.nnet.softmax,
        )
        output_layer.W = embedding_layer.embeddings.T

        # (len*batch) * n_d
        x_flat = embedding_layer.forward(self.idxs.ravel())

        # len * batch * n_d
        x = apply_dropout(x_flat, self.dropout)
        #x = x_flat
        x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

        # len * batch * (n_d+n_d)
        self.last_state = []
        prev_h = x
        for i in xrange(depth):
            hidden = self.init_state[i * 2:i * 2 + 2]
            c, h = layers[i].forward_all(prev_h, hidden, return_c=True)
            self.last_state += [c[-1], h[-1]]
            prev_h = h

        prev_h = apply_dropout(prev_h, self.dropout)
        self.p_y_given_x = output_layer.forward(prev_h.reshape(x_flat.shape))

        idys = self.idys.ravel()
        self.nll = T.nnet.categorical_crossentropy(self.p_y_given_x, idys)

        self.params = [x for l in layers for x in l.params]
        self.params += [embedding_layer.embeddings, output_layer.b]
        self.num_params = sum(
            len(x.get_value(borrow=True).ravel()) for x in self.params)
        say("# of params in total: {}\n".format(self.num_params))
        layers += [embedding_layer, output_layer]
Example no. 6
    def ready(self):
        global total_generate_time
        #say("in generator ready: \n")
        #start_generate_time = time.time()
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(n_in=n_e,
                         n_out=n_d,
                         activation=activation,
                         order=args.order)
            elif layer_type == "lstm":
                l = LSTM(n_in=n_e, n_out=n_d, activation=activation)

            l = Layer(n_in=n_e, n_out=n_d, activation=sigmoid)

            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX).dimshuffle(
            (0, 1, "x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward(embs)
        h2 = layers[1].forward(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        #size = n_e

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)
        #probs = output_layer.forward(embs)
        #probs1 = probs.reshape(x.shape)

        #probs_rev = output_layer.forward(flipped_embs)
        #probs1_rev = probs.reshape(x.shape)

        #probs = T.concatenate([probs1, probs1_rev[::-1]], axis=2)

        # len*batch
        probs2 = probs.reshape(x.shape)
        if self.args.seed is not None:
            self.MRG_rng = MRG_RandomStreams(self.args.seed)
        else:
            self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(
            self.MRG_rng.binomial(size=probs2.shape, p=probs2),
            theano.config.floatX)  #"int8")

        # we are computing an approximated gradient by sampling z,
        # so the sampled z should be marked as not part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        #self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        z2 = z_pred.dimshuffle((0, 1, "x"))
        logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
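
The generator above samples the binary selection z from the predicted probabilities, detaches the sample from the gradient graph, and scores it with log p(z|x) = -binary_crossentropy(probs, z). A minimal standalone sketch of just that sampling step (assuming only Theano; names, shapes, and the toy input are illustrative):

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

probs = T.matrix("probs")                       # len * batch, selection probabilities
rng = MRG_RandomStreams(seed=1234)

z = T.cast(rng.binomial(size=probs.shape, p=probs), theano.config.floatX)
z = theano.gradient.disconnected_grad(z)        # the sample is not a gradient path
logpz = -T.nnet.binary_crossentropy(probs, z)   # elementwise log p(z | probs)

f = theano.function([probs], [z, logpz])
sample, scores = f(np.full((5, 2), 0.5, dtype=theano.config.floatX))
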
Example no. 7
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout).astype(theano.config.floatX)
            )

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        layer_type = args.layer.lower()
        for i in xrange(1):
            l = CNN(
                    n_in = n_e,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
            layers.append(l)

        # len * batch
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0,1,'x'))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h_final = h1
        size = n_d
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = 1,
                activation = sigmoid
            )

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        self.MRG_rng = MRG_RandomStreams()
        z_pred_dim3 = self.MRG_rng.binomial(size=probs.shape, p=probs, dtype="int8")
        z_pred = z_pred_dim3.reshape(x.shape)

        # we are computing an approximated gradient by sampling z,
        # so the sampled z should be marked as not part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        print "z_pred", z_pred.ndim

        #logpz = - T.nnet.binary_crossentropy(probs, z_pred_dim3) * masks
        logpz = - T.nnet.binary_crossentropy(probs, z_pred_dim3)
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
Example no. 8
    def ready(self):
        encoder = self.encoder
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = encoder.dropout

        # len*batch
        x = self.x = encoder.x
        z = self.z = encoder.z

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in range(2):
            if layer_type == "rcnn":
                l = RCNN(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation,
                    order=args.order)
            elif layer_type == "lstm":
                l = LSTM(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation)
            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, "x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        probs2 = probs.reshape(x.shape)
        self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(
            self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8")

        # we are computing an approximated gradient by sampling z,
        # so the sampled z should be marked as not part of the gradient propagation path
        #
        self.z_pred = theano.gradient.disconnected_grad(z_pred)

        z2 = z.dimshuffle((0, 1, "x"))
        logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        zdiff_pre = (z[1:] - z[:-1]) * 1.0
        zdiff = T.sum(abs(zdiff_pre), axis=0, dtype=theano.config.floatX)

        loss_mat = encoder.loss_mat
        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:, args.aspect]
        self.loss_vec = loss_vec

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        cost = self.cost = cost_logpz * 10 + l2_cost
        print("cost.dtype", cost.dtype)

        self.cost_e = loss * 10 + encoder.l2_cost
Example no. 9
    def ready(self):
        args = self.args
        embedding_layer = self.embedding_layer
        self.n_hidden = args.hidden_dim
        self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX))

        # x is length * batch_size
        # y is batch_size
        self.x = T.imatrix('x')
        self.y = T.ivector('y')

        x = self.x
        y = self.y
        n_hidden = self.n_hidden
        n_in = self.n_in

        # fetch word embeddings
        # (len * batch_size) * n_in
        slices = embedding_layer.forward(x.ravel())
        self.slices = slices

        # 3-d tensor, len * batch_size * n_in
        slices = slices.reshape((x.shape[0], x.shape[1], n_in))

        # stacking the feature extraction layers
        pooling = args.pooling
        depth = args.depth
        layers = self.layers = []
        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        size = 0
        softmax_inputs = []
        activation = get_activation_by_name(args.act)
        for i in range(depth):
            if args.layer.lower() == "lstm":
                layer = LSTM(n_in=n_hidden if i > 0 else n_in, n_out=n_hidden)
            elif args.layer.lower() == "strcnn":
                layer = StrCNN(n_in=n_hidden if i > 0 else n_in,
                               n_out=n_hidden,
                               activation=activation,
                               decay=args.decay,
                               order=args.order)
            elif args.layer.lower() == "rcnn":
                layer = RCNN(n_in=n_hidden if i > 0 else n_in,
                             n_out=n_hidden,
                             activation=activation,
                             order=args.order,
                             mode=args.mode)
            else:
                raise Exception("unknown layer type: {}".format(args.layer))

            layers.append(layer)
            prev_output = layer.forward_all(prev_output)
            if pooling:
                softmax_inputs.append(T.sum(prev_output,
                                            axis=0))  # summing over columns
            else:
                softmax_inputs.append(prev_output[-1])
            prev_output = apply_dropout(prev_output, dropout)
            size += n_hidden

        # final feature representation is the concatenation of all extraction layers
        if pooling:
            softmax_input = T.concatenate(softmax_inputs, axis=1) / x.shape[0]
        else:
            softmax_input = T.concatenate(softmax_inputs, axis=1)
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        # feed the feature repr. to the softmax output layer
        layers.append(
            Layer(n_in=size,
                  n_out=self.nclasses,
                  activation=softmax,
                  has_bias=False))

        for l, i in zip(layers, range(len(layers))):
            say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))

        # unnormalized score of y given x
        self.p_y_given_x = layers[-1].forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean(
            T.nnet.categorical_crossentropy(self.p_y_given_x, y))

        # adding regularizations
        self.l2_sqr = None
        self.params = []
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))
Example no. 10
    def ready(self):
        args = self.args
        weights = self.weights

        # len(title) * batch
        idts = self.idts = T.imatrix()

        # len(body) * batch
        idbs = self.idbs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        idps = self.idps = T.imatrix()

        dropout = self.dropout = theano.shared(np.float64(args.dropout).astype(
                            theano.config.floatX))
        dropout_op = self.dropout_op = Dropout(self.dropout)

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = [ ]
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            layers.append(feature_layer)

        # feature computation starts here

        # (len*batch)*n_e
        xt = embedding_layer.forward(idts.ravel())
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
            xt = xt * xt_w

        # len*batch*n_e
        xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
        xt = apply_dropout(xt, dropout)

        # (len*batch)*n_e
        xb = embedding_layer.forward(idbs.ravel())
        if weights is not None:
            xb_w = weights[idbs.ravel()].dimshuffle((0,'x'))
            xb = xb * xb_w

        # len*batch*n_e
        xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
        xb = apply_dropout(xb, dropout)

        prev_ht = self.xt = xt
        prev_hb = self.xb = xb
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            hb = layers[i].forward_all(prev_hb)
            prev_ht = ht
            prev_hb = hb

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            hb = self.normalize_3d(hb)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.hb = hb

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            hb = self.average_without_padding(hb, idbs)
        else:
            ht = ht[-1]
            hb = hb[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
        h_final = (ht+hb)*0.5
        h_final = apply_dropout(h_final, dropout)
        h_final = self.normalize_2d(h_final)
        self.h_final = h_final
        say("h_final dtype: {}\n".format(ht.dtype))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])

        # For training:
        xp = h_final[idps.ravel()]
        xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:,0,:]
        # num query
        pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        loss = T.mean( (diff>0)*diff )
        self.loss = loss

        params = [ ]
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)
        ))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.cost = self.loss + l2_reg
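
The ranking objective above is a max-margin hinge: for each query it takes max(0, max_neg_score - pos_score + 1.0) and averages over queries. A tiny numpy illustration with made-up scores:

import numpy as np

pos_scores = np.array([0.9, 0.2])           # score of the positive candidate, per query
neg_scores = np.array([[0.3, 0.5],          # scores of the negative candidates, per query
                       [0.4, 0.1]])

diff = neg_scores.max(axis=1) - pos_scores + 1.0
loss = np.mean(np.where(diff > 0, diff, 0.0))   # hinges 0.6 and 1.2, mean 0.9
print(loss)
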
Example no. 11
    def ready(self):
        args = self.args
        embedding_layer = self.embedding_layer
        user_embedding_layer = self.user_embedding_layer
        self.n_hidden = args.hidden_dim
        self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX)
        )

        # x is length * batch_size
        # y is batch_size
        self.x = T.imatrix('x')
        self.w_masks = T.fmatrix('mask')
        self.w_lens = T.fvector('lens')
        self.s_ml = T.iscalar('sent_maxlen')
        self.s_num = T.iscalar('sent_num')
        self.y = T.ivector('y')
        self.usr = T.ivector('users')

        x = self.x
        y = self.y
        usr = self.usr
        w_masks = self.w_masks
        w_lens = self.w_lens
        s_ml = self.s_ml
        s_num = self.s_num
        n_hidden = self.n_hidden
        n_emb = n_in = self.n_in

        layers = self.layers = []

        slicesu = user_embedding_layer.forward(usr)
        slices = embedding_layer.forward(x.ravel())
        self.slices = slices  # important for updating word embeddings

        # 3-d tensor, len * batch_size * n_in
        slices = slices.reshape((x.shape[0], x.shape[1], n_in))

        pooling = args.pooling
        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        size = 0

        n_hidden_t = n_hidden
        if args.direction == "bi":
            n_hidden_t = 2 * n_hidden

        softmax_inputs = []
        activation = get_activation_by_name(args.act)

        if args.layer.lower() == "lstm":
            layer = LSTM(n_in=n_in,
                         n_out=n_hidden_t,
                         direction=args.direction
                         )
        elif args.layer.lower() == "cnn":
            layer = CNN(n_in=n_in,
                        n_out=n_hidden_t,
                        activation=activation,
                        order=args.order
                        )
        else:
            raise Exception("unknown layer type: {}".format(args.layer))

        layers.append(layer)
        prev_output = layer.forward_all(prev_output, masks=w_masks)
        prev_output = apply_dropout(prev_output, dropout)

        # final feature representation is the concatenation of all extraction layers
        if args.user_atten:
            layer = IterAttentionLayer(
                n_in=n_emb,
                n_out=n_hidden_t
            )
            layers.append(layer)
            if args.user_atten_base:
                slicesu = None
            softmax_input = layers[-1].multi_hop_forward(
                prev_output, user_embs=slicesu, isWord=True, masks=w_masks)
        else:
            if pooling:
                softmax_input = T.sum(prev_output, axis=0) / w_lens.dimshuffle(0, 'x')
            else:
                ind = T.cast(w_lens - T.ones_like(w_lens), 'int32')
                softmax_input = prev_output[T.arange(ind.shape[0]), ind]

        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        n_in = n_hidden_t
        size = 0
        softmax_inputs = []
        [sentlen, emblen] = T.shape(softmax_input)
        prev_output = softmax_input.reshape(
            (sentlen / s_num, s_num, emblen)).dimshuffle(1, 0, 2)
        if args.layer.lower() == "lstm":
            layer = LSTM(n_in=n_in,
                         n_out=n_hidden_t,
                         direction=args.direction
                         )
        elif args.layer.lower() == "cnn":
            layer = CNN(n_in=n_in,
                        n_out=n_hidden_t,
                        activation=activation,
                        order=args.order,
                        )
        else:
            raise Exception("unknown layer type: {}".format(args.layer))

        layers.append(layer)
        prev_output = layer.forward_all(prev_output)
        prev_output = apply_dropout(prev_output, dropout)

        if args.user_atten:
            layer = IterAttentionLayer(
                n_in=n_emb,
                n_out=n_hidden_t
            )
            layers.append(layer)

            if args.user_atten_base:
                slicesu = None
            softmax_input = layers[-1].multi_hop_forward(
                prev_output, user_embs=slicesu, isWord=False)
        else:
            if pooling:
                softmax_input = T.sum(prev_output, axis=0) / \
                    T.cast(s_num, 'float32')
            else:
                softmax_input = prev_output[-1]
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        size = n_hidden_t
        layers.append(Layer(
            n_in=size,
            n_out=self.nclasses,
            activation=softmax,
            has_bias=False
        ))
        if not args.fix_emb:
            for l, i in zip(layers, range(len(layers))):
                say("layer {}: n_in={}\tn_out={}\n".format(
                    i, l.n_in, l.n_out
                ))
        else:
            for l, i in zip(layers[1:], range(len(layers[1:]))):
                say("layer {}: n_in={}\tn_out={}\n".format(
                    i, l.n_in, l.n_out
                ))

        # unnormalized score of y given x
        self.p_y_given_x = layers[-1].forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean(T.nnet.categorical_crossentropy(
            self.p_y_given_x,
            y
        ))

        # adding regularizations
        self.l2_sqr = None
        self.params = []
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(x.get_value(borrow=True).ravel())
                      for x in self.params)
        say("total # parameters: {}\n".format(nparams))
Example no. 12
    def ready(self):
        encoder = self.encoder
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = encoder.dropout

        # len*batch
        x = self.x = encoder.x
        z = self.z = encoder.z

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(
                        n_in = n_e,# if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = LSTM(
                        n_in = n_e,# if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0,1,"x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = 1,
                activation = sigmoid
            )

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        probs2 = probs.reshape(x.shape)
        self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8")

        # we are computing an approximated gradient by sampling z,
        # so the sampled z should be marked as not part of the gradient propagation path
        #
        self.z_pred = theano.gradient.disconnected_grad(z_pred)

        z2 = z.dimshuffle((0,1,"x"))
        logpz = - T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

        loss_mat = encoder.loss_mat
        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:,args.aspect]
        self.loss_vec = loss_vec

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        cost = self.cost = cost_logpz * 10 + l2_cost
        print "cost.dtype", cost.dtype

        self.cost_e = loss * 10 + encoder.l2_cost
Example no. 13
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout).astype(theano.config.floatX)
            )

        # len*batch
        x = self.x = T.imatrix()

        z = self.z = T.bmatrix()
        z = z.dimshuffle((0,1,"x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = ExtLSTM(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch * 1
        masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        pooling = args.pooling
        lst_states = [ ]
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum/cnt_non_padding) # mean pooling
            else:
                lst_states.append(h_next[-1]) # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = sigmoid
            )

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch
        loss_mat = self.loss_mat = (preds-y)**2
        loss = self.loss = T.mean(loss_mat)

        pred_diff = self.pred_diff = T.mean(T.max(preds, axis=1) - T.min(preds, axis=1))

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        cost = self.cost = loss * 10 + l2_cost
Example no. 14
    def _initialize_encoder_graph(self):

        with tf.name_scope('embeddings'):
            self.titles = tf.nn.embedding_lookup(
                self.embeddings, self.titles_words_ids_placeholder)
            self.bodies = tf.nn.embedding_lookup(
                self.embeddings, self.bodies_words_ids_placeholder)

            if self.weights is not None:
                titles_weights = tf.nn.embedding_lookup(
                    self.weights, self.titles_words_ids_placeholder)
                titles_weights = tf.expand_dims(titles_weights, axis=2)
                self.titles = self.titles * titles_weights

                bodies_weights = tf.nn.embedding_lookup(
                    self.weights, self.bodies_words_ids_placeholder)
                bodies_weights = tf.expand_dims(bodies_weights, axis=2)
                self.bodies = self.bodies * bodies_weights

            self.titles = tf.nn.dropout(self.titles, 1.0 - self.dropout_prob)
            self.bodies = tf.nn.dropout(self.bodies, 1.0 - self.dropout_prob)

        with tf.name_scope('CNN'):
            print 'ignoring depth at the moment !!'
            self.embedded_titles_expanded = tf.expand_dims(self.titles, -1)
            self.embedded_bodies_expanded = tf.expand_dims(self.bodies, -1)
            # TensorFlow's convolutional conv2d operation expects a 4-dimensional
            # tensor with dimensions corresponding to batch, width, height and channel
            # The result of our embedding doesn't contain the channel dimension,
            # so we add it manually, leaving us with a layer of
            # shape [batch/None, sequence_length, embedding_size, 1].

            # So, each element of the word vector is a list of size 1 instead of a real number.

            # CONVOLUTION AND MAXPOOLING
            pooled_outputs_t = []
            pooled_outputs_b = []
            filter_sizes = [3]
            for i, filter_size in enumerate(filter_sizes):
                with tf.name_scope("conv-maxpool-%s" % filter_size):
                    # Convolution Layer
                    filter_shape = [
                        filter_size, self.embedding_layer.n_d, 1,
                        self.args.hidden_dim
                    ]
                    print 'assuming num filters = hidden dim. IS IT CORRECT? '

                    w_vals, b_vals = init_w_b_vals(filter_shape,
                                                   [self.args.hidden_dim],
                                                   self.args.activation)
                    W = tf.Variable(w_vals, name="conv-W")
                    b = tf.Variable(b_vals, name="conv-b")
                    # self.W = W

                    with tf.name_scope('titles_output'):
                        conv_t = tf.nn.conv2d(
                            self.embedded_titles_expanded,
                            W,
                            strides=[
                                1, 1, 1, 1
                            ],  # how much the window shifts by in each of the dimensions.
                            padding="VALID",
                            name="conv-titles")

                        # Apply nonlinearity
                        nl_fun = get_activation_by_name(self.args.activation)
                        h_t = nl_fun(tf.nn.bias_add(conv_t, b),
                                     name="act-titles")

                        if self.args.average:
                            pooled_t = tf.reduce_mean(h_t,
                                                      axis=1,
                                                      keep_dims=True)
                        else:
                            pooled_t = tf.reduce_max(h_t,
                                                     axis=1,
                                                     keep_dims=True)

                        # self.pooled_t = pooled_t
                        pooled_outputs_t.append(pooled_t)

                    with tf.name_scope('bodies_output'):
                        conv_b = tf.nn.conv2d(self.embedded_bodies_expanded,
                                              W,
                                              strides=[1, 1, 1, 1],
                                              padding="VALID",
                                              name="conv-bodies")
                        # self.conv_b = conv_b

                        nl_fun = get_activation_by_name(self.args.activation)
                        h_b = nl_fun(tf.nn.bias_add(conv_b, b),
                                     name="act-bodies")

                        if self.args.average:
                            pooled_b = tf.reduce_mean(h_b,
                                                      axis=1,
                                                      keep_dims=True)
                        else:
                            pooled_b = tf.reduce_max(h_b,
                                                     axis=1,
                                                     keep_dims=True)

                        # self.pooled_b = pooled_b
                        pooled_outputs_b.append(pooled_b)

            # Combine all the pooled features
            num_filters_total = self.args.hidden_dim * len(filter_sizes)
            self.t_pool = tf.concat(pooled_outputs_t, 3)
            self.t_state = tf.reshape(self.t_pool, [-1, num_filters_total])
            # reshape so that we have shape [batch, num_features_total]

            self.b_pool = tf.concat(pooled_outputs_b, 3)
            self.b_state = tf.reshape(self.b_pool, [-1, num_filters_total])

        with tf.name_scope('outputs'):
            # batch * d
            h_final = (self.t_state + self.b_state) * 0.5
            h_final = tf.nn.dropout(h_final, 1.0 - self.dropout_prob)
            self.h_final = self.normalize_2d(h_final)
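
As the comments in the block above note, conv2d wants a 4-D input, so the embedded sequence gets a trailing channel dimension. A toy TF 1.x shape check with illustrative sizes (not the model's): with "VALID" padding and a [filter_size, emb_dim, 1, hidden_dim] filter, the convolution output is [batch, seq_len - filter_size + 1, 1, hidden_dim] before pooling over the time axis.

import numpy as np
import tensorflow as tf

batch, seq_len, emb_dim, hidden_dim, filter_size = 2, 10, 8, 4, 3
x = tf.constant(np.random.rand(batch, seq_len, emb_dim).astype("float32"))
x4 = tf.expand_dims(x, -1)                               # add the channel dimension
W = tf.Variable(tf.random_normal([filter_size, emb_dim, 1, hidden_dim]))
conv = tf.nn.conv2d(x4, W, strides=[1, 1, 1, 1], padding="VALID")
pooled = tf.reduce_max(conv, axis=1, keep_dims=True)     # [batch, 1, 1, hidden_dim]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(tf.shape(conv)))                      # -> [2, 8, 1, 4]
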
Example no. 15
 def lstm_cell():
     _cell = tf.nn.rnn_cell.LSTMCell(
         self.args.hidden_dim,
         state_is_tuple=True,
         activation=get_activation_by_name(self.args.activation))
     return _cell
Example no. 16
    def ready(self):
        args = self.args
        weights = self.weights

        # len(source) * batch
        idxs = self.idxs = T.imatrix()

        # len(target) * batch
        idys = self.idys = T.imatrix()
        idts = idys[:-1]
        idgs = idys[1:]

        dropout = self.dropout = theano.shared(np.float64(args.dropout).astype(
                            theano.config.floatX))

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d
        n_V = self.n_V = embedding_layer.n_V

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = [ ]
        for i in range(depth*2):
            if LayerType != RCNN:
                feature_layer = LayerType(
                        n_in = n_e if i/2 == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType(
                        n_in = n_e if i/2 == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            layers.append(feature_layer)

        self.output_layer = output_layer = Layer(
                n_in = n_d,
                n_out = n_V,
                activation = T.nnet.softmax,
            )

        # feature computation starts here

        # (len*batch)*n_e
        xs_flat = embedding_layer.forward(idxs.ravel())
        xs_flat = apply_dropout(xs_flat, dropout)
        if weights is not None:
            xs_w = weights[idxs.ravel()].dimshuffle((0,'x'))
            xs_flat = xs_flat * xs_w
        # len*batch*n_e
        xs = xs_flat.reshape((idxs.shape[0], idxs.shape[1], n_e))

        # (len*batch)*n_e
        xt_flat = embedding_layer.forward(idts.ravel())
        xt_flat = apply_dropout(xt_flat, dropout)
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
            xt_flat = xt_flat * xt_w
        # len*batch*n_e
        xt = xt_flat.reshape((idts.shape[0], idts.shape[1], n_e))

        prev_hs = xs
        prev_ht = xt
        for i in range(depth):
            # len*batch*n_d
            hs = layers[i*2].forward_all(prev_hs, return_c=True)
            ht = layers[i*2+1].forward_all(prev_ht, hs[-1])
            hs = hs[:,:,-n_d:]
            ht = ht[:,:,-n_d:]
            prev_hs = hs
            prev_ht = ht
            prev_hs = apply_dropout(hs, dropout)
            prev_ht = apply_dropout(ht, dropout)

        self.p_y_given_x = output_layer.forward(prev_ht.reshape(
                                (xt_flat.shape[0], n_d)
                            ))

        h_final = hs[-1]
        self.scores2 = -(h_final[1:]-h_final[0]).norm(2,axis=1)
        h_final = self.normalize_2d(h_final)
        self.scores = T.dot(h_final[1:], h_final[0])

        # (len*batch)
        nll = T.nnet.categorical_crossentropy(
                        self.p_y_given_x,
                        idgs.ravel()
                    )
        nll = nll.reshape(idgs.shape)
        self.nll = nll
        self.mask = mask = T.cast(T.neq(idgs, self.padding_id), theano.config.floatX)
        nll = T.sum(nll*mask, axis=0)

        #layers.append(embedding_layer)
        layers.append(output_layer)
        params = [ ]
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)
        ))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.loss = T.mean(nll)
        self.cost = self.loss + l2_reg
Example no. 17
    def ready(self):
        args = self.args
        embedding_layer = self.embedding_layer
        self.n_hidden = args.hidden_dim
        self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
                np.float64(args.dropout_rate).astype(theano.config.floatX)
            )

        # x is length * batch_size
        # y is batch_size
        self.x = T.imatrix('x')
        self.y = T.ivector('y')

        x = self.x
        y = self.y
        n_hidden = self.n_hidden
        n_in = self.n_in

        # fetch word embeddings
        # (len * batch_size) * n_in
        slices  = embedding_layer.forward(x.ravel())
        self.slices = slices

        # 3-d tensor, len * batch_size * n_in
        slices = slices.reshape( (x.shape[0], x.shape[1], n_in) )

        # stacking the feature extraction layers
        pooling = args.pooling
        depth = args.depth
        layers = self.layers = [ ]
        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        size = 0
        softmax_inputs = [ ]
        activation = get_activation_by_name(args.act)
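        # each extraction layer contributes one feature vector per example:
        # the sum over time steps when pooling is enabled (divided by the
        # sequence length further below), otherwise the layer's last state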
        for i in range(depth):
            if args.layer.lower() == "lstm":
                layer = LSTM(
                            n_in = n_hidden if i > 0 else n_in,
                            n_out = n_hidden
                        )
            elif args.layer.lower() == "strcnn":
                layer = StrCNN(
                            n_in = n_hidden if i > 0 else n_in,
                            n_out = n_hidden,
                            activation = activation,
                            decay = args.decay,
                            order = args.order
                        )
            elif args.layer.lower() == "rcnn":
                layer = RCNN(
                            n_in = n_hidden if i > 0 else n_in,
                            n_out = n_hidden,
                            activation = activation,
                            order = args.order,
                            mode = args.mode
                        )
            else:
                raise Exception("unknown layer type: {}".format(args.layer))

            layers.append(layer)
            prev_output = layer.forward_all(prev_output)
            if pooling:
                softmax_inputs.append(T.sum(prev_output, axis=0)) # sum over the length (time) dimension
            else:
                softmax_inputs.append(prev_output[-1])
            prev_output = apply_dropout(prev_output, dropout)
            size += n_hidden

        # final feature representation is the concatenation of all extraction layers
        if pooling:
            softmax_input = T.concatenate(softmax_inputs, axis=1) / x.shape[0]
        else:
            softmax_input = T.concatenate(softmax_inputs, axis=1)
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        # feed the feature repr. to the softmax output layer
        layers.append( Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = softmax,
                has_bias = False
        ) )

        for i, l in enumerate(layers):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

        # predicted probability of y given x (softmax output)
        self.p_y_given_x = layers[-1].forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean( T.nnet.categorical_crossentropy(
                                    self.p_y_given_x,
                                    y
                            ))

        # adding regularizations
        self.l2_sqr = None
        self.params = [ ]
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))
Example n. 18
0
    def ready(self):
        generator = self.generator
        args = self.args
        weights = self.weights

        dropout = generator.dropout

        # len(text) * batch
        idts = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0, 1, "x"))

        # batch * 2
        pairs = self.pairs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        triples = self.triples = T.imatrix()

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
            LayerType2 = ExtRCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
            LayerType2 = ExtLSTM
        #elif args.layer.lower() == "gru":
        #    LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = []
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation)
            else:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation,
                                          order=args.order,
                                          mode=args.mode,
                                          has_outgate=args.outgate)
            layers.append(feature_layer)

        extlayers = self.extlayers = []
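        # the Ext* layers additionally take the sampled selection mask z;
        # copy_params copies the weights of the corresponding full-text layer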
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                           n_out=n_d,
                                           activation=activation)
            else:
                feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                           n_out=n_d,
                                           activation=activation,
                                           order=args.order,
                                           mode=args.mode,
                                           has_outgate=args.outgate)
            feature_layer.copy_params(layers[i])
            extlayers.append(feature_layer)

        # feature computation starts here

        xt = generator.word_embs

        # encode full text into representation
        prev_ht = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            prev_ht = ht

        # encode selected text into representation
        prev_htz = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            htz = extlayers[i].forward_all(prev_htz, z)
            prev_htz = htz

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            htz = self.normalize_3d(htz)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.htz = htz

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            htz = self.average_without_padding(htz, idts, z)
        else:
            ht = ht[-1]
            htz = htz[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
        h_final = apply_dropout(ht, dropout)
        h_final = self.normalize_2d(h_final)
        hz_final = apply_dropout(htz, dropout)
        hz_final = self.normalize_2d(hz_final)
        self.h_final = h_final
        self.hz_final = hz_final

        say("h_final dtype: {}\n".format(ht.shape))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])
        self.scores_z = T.dot(hz_final[1:], hz_final[0])

        # For training encoder:
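        # max-margin ranking loss: for each query (column 0 of a triple), the
        # score of the positive candidate (column 1) should exceed the best
        # negative score by a margin of 1.0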
        xp = h_final[triples.ravel()]
        xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:, 0, :]
        # num query
        pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :],
                           axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        hinge_loss = T.mean((diff > 0) * diff)

        # For training generator

        # batch
        self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
        pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:, 1]],
                                           axis=1)
        alpha = args.alpha
        loss_vec = self_cosine_distance * alpha + pair_cosine_distance * (
            1 - alpha)
        #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        sfactor = args.sparsity
        cfactor = args.sparsity * args.coherent
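        # zsum penalizes the number of selected words (sparsity), zdiff
        # penalizes 0/1 transitions in z (encourages contiguous rationales)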
        scost_vec = zsum * sfactor + zdiff * cfactor

        # batch
        cost_vec = loss_vec + scost_vec
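        # score-function (REINFORCE-style) estimator: the cost of each sampled
        # rationale weights the total log-probability of drawing that sample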
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
        self.obj = loss + sparsity_cost

        params = []
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = T.sum(p**2)  #p.norm(2)
            else:
                l2_reg = l2_reg + T.sum(p**2)  #p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.l2_cost = l2_reg

        beta = args.beta
        self.cost_g = cost_logpz + generator.l2_cost
        self.cost_e = hinge_loss + loss * beta + l2_reg
Example n. 19
0
    def ready(self):
        args = self.args
        weights = self.weights

        # len(source) * batch
        idxs = self.idxs = T.imatrix()

        # len(target) * batch
        idys = self.idys = T.imatrix()
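        # shift the target by one step: idts are the decoder inputs and idgs
        # the gold next tokens to predict (teacher forcing)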
        idts = idys[:-1]
        idgs = idys[1:]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d
        n_V = self.n_V = embedding_layer.n_V

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = []
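        # layers come in encoder/decoder pairs: layer 2*i reads the source at
        # depth i, layer 2*i+1 reads the target conditioned on the encoder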
        for i in range(depth * 2):
            if LayerType != RCNN:
                feature_layer = LayerType(n_in=n_e if i // 2 == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation)
            else:
                feature_layer = LayerType(n_in=n_e if i // 2 == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation,
                                          order=args.order,
                                          mode=args.mode,
                                          has_outgate=args.outgate)
            layers.append(feature_layer)

        self.output_layer = output_layer = Layer(
            n_in=n_d,
            n_out=n_V,
            activation=T.nnet.softmax,
        )

        # feature computation starts here

        # (len*batch)*n_e
        xs_flat = embedding_layer.forward(idxs.ravel())
        xs_flat = apply_dropout(xs_flat, dropout)
        if weights is not None:
            xs_w = weights[idxs.ravel()].dimshuffle((0, 'x'))
            xs_flat = xs_flat * xs_w
        # len*batch*n_e
        xs = xs_flat.reshape((idxs.shape[0], idxs.shape[1], n_e))

        # (len*batch)*n_e
        xt_flat = embedding_layer.forward(idts.ravel())
        xt_flat = apply_dropout(xt_flat, dropout)
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0, 'x'))
            xt_flat = xt_flat * xt_w
        # len*batch*n_e
        xt = xt_flat.reshape((idts.shape[0], idts.shape[1], n_e))

        prev_hs = xs
        prev_ht = xt
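        # at each depth, the even layer encodes the source; the odd layer runs
        # over the target initialized with the encoder's final state hs[-1];
        # only the last n_d features are kept and passed to the next depth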
        for i in range(depth):
            # len*batch*n_d
            hs = layers[i * 2].forward_all(prev_hs, return_c=True)
            ht = layers[i * 2 + 1].forward_all(prev_ht, hs[-1])
            hs = hs[:, :, -n_d:]
            ht = ht[:, :, -n_d:]
            prev_hs = hs
            prev_ht = ht
            prev_hs = apply_dropout(hs, dropout)
            prev_ht = apply_dropout(ht, dropout)

        self.p_y_given_x = output_layer.forward(
            prev_ht.reshape((xt_flat.shape[0], n_d)))

        h_final = hs[-1]
        self.scores2 = -(h_final[1:] - h_final[0]).norm(2, axis=1)
        h_final = self.normalize_2d(h_final)
        self.scores = T.dot(h_final[1:], h_final[0])

        # (len*batch)
        nll = T.nnet.categorical_crossentropy(self.p_y_given_x, idgs.ravel())
        nll = nll.reshape(idgs.shape)
        self.nll = nll
        self.mask = mask = T.cast(T.neq(idgs, self.padding_id),
                                  theano.config.floatX)
        nll = T.sum(nll * mask, axis=0)

        #layers.append(embedding_layer)
        layers.append(output_layer)
        params = []
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.loss = T.mean(nll)
        self.cost = self.loss + l2_reg
Example n. 20
0
    def ready(self):
        generator = self.generator
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = generator.dropout

        # len*batch
        x = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0,1,"x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = ExtLSTM(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch * 1
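        # the mask zeroes out both padding tokens and words not selected by
        # the rationale z; cnt_non_padding counts the remaining words per
        # example for mean pooling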
        masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # len*batch*n_e
        embs = generator.word_embs

        pooling = args.pooling
        lst_states = [ ]
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum/cnt_non_padding) # mean pooling
            else:
                lst_states.append(h_next[-1]) # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = sigmoid
            )

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch * nclasses
        loss_mat = self.loss_mat = (preds-y)**2

        pred_diff = self.pred_diff = T.mean(T.max(preds, axis=1) - T.min(preds, axis=1))

        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:,args.aspect]
        self.loss_vec = loss_vec

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
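        # the generator minimizes the sampled-cost surrogate cost_logpz, the
        # encoder its prediction loss; both are scaled by 10 before adding the
        # respective L2 terms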

        self.cost_g = cost_logpz * 10 + generator.l2_cost
        self.cost_e = loss * 10 + l2_cost
Example n. 21
0
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        z = self.z = T.bmatrix()
        z = z.dimshuffle((0, 1, "x"))
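        # here z is given as an input (one binary selection per token) rather
        # than sampled by a generator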

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        depth = args.depth
        layer_type = args.layer.lower()
        for i in range(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation,
                            order=args.order)
            elif layer_type == "lstm":
                l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation)
            layers.append(l)

        # len * batch * 1
        masks = T.cast(
            T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
            theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        pooling = args.pooling
        lst_states = []
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
            else:
                lst_states.append(h_next[-1])  # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=self.nclasses,
                                                 activation=sigmoid)

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch * nclasses
        loss_mat = self.loss_mat = (preds - y)**2
        loss = self.loss = T.mean(loss_mat)

        pred_diff = self.pred_diff = T.mean(
            T.max(preds, axis=1) - T.min(preds, axis=1))

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        cost = self.cost = loss * 10 + l2_cost
Example n. 22
0
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout).astype(theano.config.floatX)
            )

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(
                        n_in = n_e,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = LSTM(
                        n_in = n_e,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]
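        # run one layer over the sequence and another over its reverse; the
        # backward states are flipped back and concatenated to give a
        # bidirectional representation of each position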

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = ZLayer(
                n_in = size,
                n_hidden = args.hidden_dimension2,
                activation = activation
            )

        # sample z given text (i.e. x)
        z_pred, sample_updates = output_layer.sample_all(h_final)

        # we compute an approximate gradient by sampling z, so the sampled z
        # must be excluded from the gradient propagation path
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        probs = output_layer.forward_all(h_final, z_pred)
        print "probs", probs.ndim

        logpz = - T.nnet.binary_crossentropy(probs, z_pred) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
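        # zsum: number of selected tokens per example; zdiff: number of 0/1
        # transitions, used downstream as sparsity/coherence regularizers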
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
Example n. 23
0
    def ready(self):
        generator = self.generator
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]
        unk_id = embedding_layer.vocab_map["<unk>"]
        unk_vec = embedding_layer.embeddings[unk_id]

        dropout = generator.dropout

        # len*batch
        x = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0,1,"x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            l = CNN(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
            layers.append(l)

        # len * batch * 1
        masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # len*batch*n_e
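        # words not selected by the rationale (z = 0) are replaced by the
        # <unk> embedding rather than removed, so the encoder still sees a
        # full-length sequence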
        embs = generator.word_embs*z + unk_vec.dimshuffle(('x','x',0))*(1-z)

        pooling = args.pooling
        lst_states = [ ]
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum/cnt_non_padding) # mean pooling
            else:
                lst_states.append(T.max(h_next, axis=0))
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = sigmoid
            )

        # batch * nclasses
        p_y_given_x = self.p_y_given_x = output_layer.forward(h_final)
        preds = self.preds = p_y_given_x > 0.5
        print preds, preds.dtype
        print self.nclasses

        # batch * nclasses
        loss_mat = T.nnet.binary_crossentropy(p_y_given_x, y)

        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:,args.aspect]
        self.loss_vec = loss_vec

        self.true_pos = T.sum(preds*y)
        self.tot_pos = T.sum(preds)
        self.tot_true = T.sum(y)

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        if not args.fix_emb:
            params += embedding_layer.params
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_g = cost_logpz + generator.l2_cost
        self.cost_e = loss + l2_cost
Example n. 24
0
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(n_in=n_e,
                         n_out=n_d,
                         activation=activation,
                         order=args.order)
            elif layer_type == "lstm":
                l = LSTM(n_in=n_e, n_out=n_d, activation=activation)
            layers.append(l)

        # len * batch
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = ZLayer(
            n_in=size, n_hidden=args.hidden_dimension2, activation=activation)

        # sample z given text (i.e. x)
        z_pred, sample_updates = output_layer.sample_all(h_final)

        # we compute an approximate gradient by sampling z, so the sampled z
        # must be excluded from the gradient propagation path
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        probs = output_layer.forward_all(h_final, z_pred)
        print "probs", probs.ndim

        logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
Example n. 25
0
    def ready(self):
        global total_encode_time
        #say("in encoder ready: \n")
        #start_encode_time = time.time()
        generator = self.generator
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = generator.dropout

        # len*batch
        x = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0, 1, "x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation,
                            order=args.order)
            elif layer_type == "lstm":
                l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation)
            layers.append(l)

        # len * batch * 1
        masks = T.cast(
            T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
            theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # len*batch*n_e
        embs = generator.word_embs

        pooling = args.pooling
        lst_states = []
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
            else:
                lst_states.append(h_next[-1])  # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=self.nclasses,
                                                 activation=sigmoid)

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch * nclasses
        loss_mat = self.loss_mat = (preds - y)**2

        pred_diff = self.pred_diff = T.mean(
            T.max(preds, axis=1) - T.min(preds, axis=1))

        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:, args.aspect]
        self.loss_vec = loss_vec

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_g = cost_logpz * 10 + generator.l2_cost
        self.cost_e = loss * 10 + l2_cost
Example n. 26
0
    def ready(self, args, train):
        # len * batch
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = T.matrix(dtype=theano.config.floatX)

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        self.dropout = theano.shared(dropout_prob)

        self.n_d = args["hidden_dim"]

        embedding_layer = EmbeddingLayer(
                n_d = self.n_d,
                vocab = set(w for w in train)
            )
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(
                self.n_V, self.n_d
            ))

        activation = get_activation_by_name(args["activation"])

        rnn_layer = LSTM(
                 n_in = self.n_d,
                 n_out = self.n_d,
                 activation = activation
            )

        output_layer = Layer(
                n_in = self.n_d,
                n_out = self.n_V,
                activation = T.nnet.softmax,
            )

        # (len*batch) * n_d
        x_flat = embedding_layer.forward(self.idxs.ravel())

        # len * batch * n_d
        x = apply_dropout(x_flat, self.dropout)
        x = x.reshape( (self.idxs.shape[0], self.idxs.shape[1], self.n_d) )

        # len * batch * (n_d+n_d)
        h = rnn_layer.forward_all(x, self.init_state, return_c=True)

        self.last_state = h[-1]
        h = h[:,:,self.n_d:]
        h = apply_dropout(h, self.dropout)

        self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))

        idys = self.idys.ravel()
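        # per-token negative log-likelihood of the gold next words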
        self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
        #self.nll = T.nnet.categorical_crossentropy(
        #                self.p_y_given_x,
        #                idys
        #            )

        self.layers = [ embedding_layer, rnn_layer, output_layer ]
        #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
        self.params = embedding_layer.params + rnn_layer.params + output_layer.params
        self.num_params = sum(len(x.get_value(borrow=True).ravel())
                                for l in self.layers for x in l.params)
        say("# of params in total: {}\n".format(self.num_params))
Example n. 27
0
    def ready(self):
        args = self.args
        weights = self.weights

        # len(title) * batch
        idts = self.idts = T.imatrix()

        # len(body) * batch
        idbs = self.idbs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        idps = self.idps = T.imatrix()

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))
        dropout_op = self.dropout_op = Dropout(self.dropout)

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = []
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation)
            else:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation,
                                          order=args.order,
                                          mode=args.mode,
                                          has_outgate=args.outgate)
            layers.append(feature_layer)

        # feature computation starts here

        # (len*batch)*n_e
        xt = embedding_layer.forward(idts.ravel())
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0, 'x'))
            xt = xt * xt_w

        # len*batch*n_e
        xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
        xt = apply_dropout(xt, dropout)

        # (len*batch)*n_e
        xb = embedding_layer.forward(idbs.ravel())
        if weights is not None:
            xb_w = weights[idbs.ravel()].dimshuffle((0, 'x'))
            xb = xb * xb_w

        # len*batch*n_e
        xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
        xb = apply_dropout(xb, dropout)

        prev_ht = self.xt = xt
        prev_hb = self.xb = xb
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            hb = layers[i].forward_all(prev_hb)
            prev_ht = ht
            prev_hb = hb

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            hb = self.normalize_3d(hb)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.hb = hb

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            hb = self.average_without_padding(hb, idbs)
        else:
            ht = ht[-1]
            hb = hb[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
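        # the final question representation averages the title and body
        # encodings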
        h_final = (ht + hb) * 0.5
        h_final = apply_dropout(h_final, dropout)
        h_final = self.normalize_2d(h_final)
        self.h_final = h_final
        say("h_final dtype: {}\n".format(ht.dtype))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])

        # For training:
        xp = h_final[idps.ravel()]
        xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:, 0, :]
        # num query
        pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :],
                           axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        loss = T.mean((diff > 0) * diff)
        self.loss = loss

        params = []
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.cost = self.loss + l2_reg
Example n. 28
0
    def ready(self):
        generator = self.generator
        args = self.args
        weights = self.weights

        dropout = generator.dropout

        # len(text) * batch
        idts = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0,1,"x"))

        # batch * 2
        pairs = self.pairs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        triples = self.triples = T.imatrix()

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
            LayerType2 = ExtRCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
            LayerType2 = ExtLSTM
        #elif args.layer.lower() == "gru":
        #    LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = [ ]
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            layers.append(feature_layer)

        extlayers = self.extlayers = [ ]
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType2(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType2(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            feature_layer.copy_params(layers[i])
            extlayers.append(feature_layer)


        # feature computation starts here

        xt = generator.word_embs

        # encode full text into representation
        prev_ht = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            prev_ht = ht

        # encode selected text into representation
        prev_htz = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            htz = extlayers[i].forward_all(prev_htz, z)
            prev_htz = htz

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            htz = self.normalize_3d(htz)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.htz = htz

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            htz = self.average_without_padding(htz, idts, z)
        else:
            ht = ht[-1]
            htz = htz[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
        h_final = apply_dropout(ht, dropout)
        h_final = self.normalize_2d(h_final)
        hz_final = apply_dropout(htz, dropout)
        hz_final = self.normalize_2d(hz_final)
        self.h_final = h_final
        self.hz_final = hz_final

        say("h_final dtype: {}\n".format(ht.dtype))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])
        self.scores_z = T.dot(hz_final[1:], hz_final[0])

        # For training encoder:
        xp = h_final[triples.ravel()]
        xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:,0,:]
        # num query
        pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        hinge_loss = T.mean( (diff>0)*diff )

        # For training generator

        # batch
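        # the rationale encoding hz should stay close in cosine distance both
        # to the full-text encoding of the same question and to that of its
        # paired question; alpha weights the two distances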
        self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
        pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:,1]], axis=1)
        alpha = args.alpha
        loss_vec = self_cosine_distance*alpha + pair_cosine_distance*(1-alpha)
        #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        sfactor = args.sparsity
        cfactor = args.sparsity * args.coherent
        scost_vec = zsum*sfactor + zdiff*cfactor

        # batch
        cost_vec = loss_vec + scost_vec
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
        self.obj =  loss + sparsity_cost

        params = [ ]
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)
        ))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = T.sum(p**2) #p.norm(2)
            else:
                l2_reg = l2_reg + T.sum(p**2) #p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.l2_cost = l2_reg

        beta = args.beta
        self.cost_g = cost_logpz + generator.l2_cost
        self.cost_e = hinge_loss + loss*beta + l2_reg
        print "cost dtype", self.cost_g.dtype, self.cost_e.dtype