Example #1
    def ready(self, args, train):
        # len * batch
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = T.matrix(dtype=theano.config.floatX)

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        self.dropout = theano.shared(dropout_prob)

        self.n_d = args["hidden_dim"]

        embedding_layer = EmbeddingLayer(n_d=self.n_d,
                                         vocab=set(w for w in train))
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

        activation = get_activation_by_name(args["activation"])

        rnn_layer = LSTM(n_in=self.n_d, n_out=self.n_d, activation=activation)

        output_layer = Layer(
            n_in=self.n_d,
            n_out=self.n_V,
            activation=T.nnet.softmax,
        )

        # (len*batch) * n_d
        x_flat = embedding_layer.forward(self.idxs.ravel())

        # len * batch * n_d
        x = apply_dropout(x_flat, self.dropout)
        x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

        # len * batch * (n_d+n_d)
        h = rnn_layer.forward_all(x, self.init_state, return_c=True)

        self.last_state = h[-1]
        h = h[:, :, self.n_d:]
        h = apply_dropout(h, self.dropout)

        self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))

        idys = self.idys.ravel()
        self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
        #self.nll = T.nnet.categorical_crossentropy(
        #                self.p_y_given_x,
        #                idys
        #            )

        self.layers = [embedding_layer, rnn_layer, output_layer]
        #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
        self.params = embedding_layer.params + rnn_layer.params + output_layer.params
        self.num_params = sum(
            len(x.get_value(borrow=True).ravel()) for l in self.layers
            for x in l.params)
        say("# of params in total: {}\n".format(self.num_params))
Example #2
    def ready(self):
        args = self.args
        index = self.index = T.lscalar()
        x = self.x = T.fmatrix()
        y = self.y = T.ivector()

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype("float32"))

        n_d = args.hidden_dim
        layers = self.layers = []
        for i in xrange(args.depth):
            l = Layer(n_in=28 * 28 if i == 0 else n_d,
                      n_out=n_d,
                      activation=ReLU)
            layers.append(l)

        output_layer = self.output_layer = Layer(n_in=n_d,
                                                 n_out=10,
                                                 activation=softmax)

        h = x
        for l in layers:
            h = l.forward(h)
            h = apply_dropout(h, dropout)

        self.h_final = h

        # batch * 10
        probs = self.probs = output_layer.forward(h)

        # batch
        preds = self.preds = T.argmax(probs, axis=1)
        err = self.err = T.mean(T.cast(T.neq(preds, y), dtype="float32"))

        #
        loss = self.loss = -T.mean(T.log(probs[T.arange(y.shape[0]), y]))
        #loss = self.loss = T.mean( T.nnet.categorical_crossentropy(
        #                            probs,
        #                            y
        #                    ))

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost += T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        self.l2_cost = l2_cost
        self.cost = loss + l2_cost
        print "cost.dtype", self.cost.dtype
Example #3
    def build_model(self):
        args = self.args
        weights = self.weights

        meta_emb = self.meta_emb = self.embs[0]
        golden_embs = self.embs[1:]

        n_m_d = meta_emb.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX))

        batch_ids = self.batch_ids = T.ivector('batch_d_char')
        batch_masks = self.batch_masks = T.fmatrix('batch_d_char_mask')

        layers = self.layers = [meta_emb]

        slices_embs = meta_emb.forward(batch_ids.ravel())
        slices_embs = slices_embs.reshape((batch_ids.shape[0], n_m_d))
        prev_output = apply_dropout(slices_embs, dropout, v2=True)

        self.all_loss = 0.0
        for i in range(len(weights)):
            mask, weight, golden_emb = batch_masks[i], weights[i], golden_embs[i]
            n_o_d = golden_emb.n_d
            layer = Layer(n_m_d, n_o_d, linear)
            layers.append(layer)
            mapped_output = layer.forward(prev_output)

            slices_embs = golden_emb.forward(batch_ids.ravel())
            slices_embs = slices_embs.reshape((batch_ids.shape[0], n_o_d))
            self.all_loss += weight * T.sum(
                T.sum((mapped_output - slices_embs) *
                      (mapped_output - slices_embs),
                      axis=1) * mask) / (1e-8 + T.sum(mask))

        for i, l in enumerate(layers[1:]):
            say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))

        self.l2_sqr = None
        self.params = []

        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        self.all_loss += self.l2_sqr
        n_params = sum(
            len(x.get_value(borrow=True).ravel()) for x in self.params)
        say("total # parameters: {}\n".format(n_params))
Example #4
    def get_recon_loss(self, idxs, sent_output):
        len_sent, len_doc_batch, n_d = sent_output.shape
        recon_layer = self.recon_layer
        padding_id = self.padding_id
        dropout = self.dropout

        # (len(sent)*len(doc)*batch)*n_e
        input_flat = idxs.ravel()
        true_recon = self.embedding_layer.recon_forward(input_flat)
        sent_output = apply_dropout(sent_output, dropout)
        pred_recon = recon_layer.forward(sent_output.reshape((len_sent*len_doc_batch, n_d)))

        # (len(sent)*len(doc)*batch)
        mask = T.cast(T.neq(input_flat, padding_id), theano.config.floatX)
        n = T.sum(mask)
        loss = T.sum((true_recon - pred_recon) ** 2, axis=1) * mask
        loss = T.sum(loss) / n
        return loss
Example #5
    def ready(self):
        args = self.args
        weights = self.weights

        # len(title) * batch
        idts = self.idts = T.imatrix()

        # len(body) * batch
        idbs = self.idbs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        idps = self.idps = T.imatrix()

        dropout = self.dropout = theano.shared(np.float64(args.dropout).astype(
                            theano.config.floatX))
        dropout_op = self.dropout_op = Dropout(self.dropout)

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = [ ]
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            layers.append(feature_layer)

        # feature computation starts here

        # (len*batch)*n_e
        xt = embedding_layer.forward(idts.ravel())
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
            xt = xt * xt_w

        # len*batch*n_e
        xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
        xt = apply_dropout(xt, dropout)

        # (len*batch)*n_e
        xb = embedding_layer.forward(idbs.ravel())
        if weights is not None:
            xb_w = weights[idbs.ravel()].dimshuffle((0,'x'))
            xb = xb * xb_w

        # len*batch*n_e
        xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
        xb = apply_dropout(xb, dropout)

        prev_ht = self.xt = xt
        prev_hb = self.xb = xb
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            hb = layers[i].forward_all(prev_hb)
            prev_ht = ht
            prev_hb = hb

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            hb = self.normalize_3d(hb)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.hb = hb

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            hb = self.average_without_padding(hb, idbs)
        else:
            ht = ht[-1]
            hb = hb[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
        h_final = (ht+hb)*0.5
        h_final = apply_dropout(h_final, dropout)
        h_final = self.normalize_2d(h_final)
        self.h_final = h_final
        say("h_final dtype: {}\n".format(ht.dtype))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])

        # For training:
        xp = h_final[idps.ravel()]
        xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:,0,:]
        # num query
        pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        loss = T.mean( (diff>0)*diff )
        self.loss = loss

        params = [ ]
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)
        ))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.cost = self.loss + l2_reg
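The training part of this ready() builds a max-margin ranking loss: each row of idps lists a query, one positive candidate and several negatives, and the loss penalizes the best-scoring negative whenever it comes within a margin of the positive. A NumPy sketch of the same computation on made-up normalized vectors:

import numpy as np

def hinge_ranking_loss(h_final, idps, margin=1.0):
    # h_final: (num_rows, n_d) normalized question vectors
    # idps:    (num_query, 2 + num_neg) rows of [query, positive, negatives...]
    xp = h_final[idps.ravel()].reshape(idps.shape[0], idps.shape[1], -1)
    query_vecs = xp[:, 0, :]
    pos_scores = np.sum(query_vecs * xp[:, 1, :], axis=1)
    neg_scores = np.sum(query_vecs[:, None, :] * xp[:, 2:, :], axis=2).max(axis=1)
    diff = neg_scores - pos_scores + margin
    return np.mean((diff > 0) * diff)

h = np.random.randn(6, 4)
h /= np.linalg.norm(h, axis=1, keepdims=True)
idps = np.array([[0, 1, 2, 3],
                 [0, 4, 5, 2]])
print(hinge_ranking_loss(h, idps))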
Example #6
    def ready(self):
        args = self.args
        weights = self.weights

        # len(source) * batch
        idxs = self.idxs = T.imatrix()

        # len(target) * batch
        idys = self.idys = T.imatrix()
        idts = idys[:-1]
        idgs = idys[1:]

        dropout = self.dropout = theano.shared(np.float64(args.dropout).astype(
                            theano.config.floatX))

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d
        n_V = self.n_V = embedding_layer.n_V

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = [ ]
        for i in range(depth*2):
            if LayerType != RCNN:
                feature_layer = LayerType(
                        n_in = n_e if i/2 == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType(
                        n_in = n_e if i/2 == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            layers.append(feature_layer)

        self.output_layer = output_layer = Layer(
                n_in = n_d,
                n_out = n_V,
                activation = T.nnet.softmax,
            )

        # feature computation starts here

        # (len*batch)*n_e
        xs_flat = embedding_layer.forward(idxs.ravel())
        xs_flat = apply_dropout(xs_flat, dropout)
        if weights is not None:
            xs_w = weights[idxs.ravel()].dimshuffle((0,'x'))
            xs_flat = xs_flat * xs_w
        # len*batch*n_e
        xs = xs_flat.reshape((idxs.shape[0], idxs.shape[1], n_e))

        # (len*batch)*n_e
        xt_flat = embedding_layer.forward(idts.ravel())
        xt_flat = apply_dropout(xt_flat, dropout)
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0,'x'))
            xt_flat = xt_flat * xt_w
        # len*batch*n_e
        xt = xt_flat.reshape((idts.shape[0], idts.shape[1], n_e))

        prev_hs = xs
        prev_ht = xt
        for i in range(depth):
            # len*batch*n_d
            hs = layers[i*2].forward_all(prev_hs, return_c=True)
            ht = layers[i*2+1].forward_all(prev_ht, hs[-1])
            hs = hs[:,:,-n_d:]
            ht = ht[:,:,-n_d:]
            prev_hs = hs
            prev_ht = ht
            prev_hs = apply_dropout(hs, dropout)
            prev_ht = apply_dropout(ht, dropout)

        self.p_y_given_x = output_layer.forward(prev_ht.reshape(
                                (xt_flat.shape[0], n_d)
                            ))

        h_final = hs[-1]
        self.scores2 = -(h_final[1:]-h_final[0]).norm(2,axis=1)
        h_final = self.normalize_2d(h_final)
        self.scores = T.dot(h_final[1:], h_final[0])

        # (len*batch)
        nll = T.nnet.categorical_crossentropy(
                        self.p_y_given_x,
                        idgs.ravel()
                    )
        nll = nll.reshape(idgs.shape)
        self.nll = nll
        self.mask = mask = T.cast(T.neq(idgs, self.padding_id), theano.config.floatX)
        nll = T.sum(nll*mask, axis=0)

        #layers.append(embedding_layer)
        layers.append(output_layer)
        params = [ ]
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)
        ))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.loss = T.mean(nll)
        self.cost = self.loss + l2_reg
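The per-token cross-entropy is reshaped back to len(target) x batch and multiplied by a padding mask before being summed over time, so padded positions contribute nothing to the per-sequence loss. A NumPy sketch with a made-up padding id and loss values:

import numpy as np

padding_id = 0
idgs = np.array([[3, 2],
                 [1, 0],     # second sequence is padded after one step
                 [4, 0]])
nll = np.array([[1.2, 0.9],  # per-token NLL, same shape as idgs
                [0.4, 2.0],
                [0.7, 1.5]])

mask = (idgs != padding_id).astype("float32")
per_sequence_nll = np.sum(nll * mask, axis=0)   # sum over time, ignoring padding
print(per_sequence_nll, per_sequence_nll.mean())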
Example #7
    def ready(self):
        args = self.args
        embedding_layer = self.embedding_layer
        self.n_hidden = args.hidden_dim
        self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
                np.float64(args.dropout_rate).astype(theano.config.floatX)
            )

        # x is length * batch_size
        # y is batch_size
        self.x = T.imatrix('x')
        self.y = T.ivector('y')

        x = self.x
        y = self.y
        n_hidden = self.n_hidden
        n_in = self.n_in

        # fetch word embeddings
        # (len * batch_size) * n_in
        slices  = embedding_layer.forward(x.ravel())
        self.slices = slices

        # 3-d tensor, len * batch_size * n_in
        slices = slices.reshape( (x.shape[0], x.shape[1], n_in) )

        # stacking the feature extraction layers
        pooling = args.pooling
        depth = args.depth
        layers = self.layers = [ ]
        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        size = 0
        softmax_inputs = [ ]
        activation = get_activation_by_name(args.act)
        for i in range(depth):
            if args.layer.lower() == "lstm":
                layer = LSTM(
                            n_in = n_hidden if i > 0 else n_in,
                            n_out = n_hidden
                        )
            elif args.layer.lower() == "strcnn":
                layer = StrCNN(
                            n_in = n_hidden if i > 0 else n_in,
                            n_out = n_hidden,
                            activation = activation,
                            decay = args.decay,
                            order = args.order
                        )
            elif args.layer.lower() == "rcnn":
                layer = RCNN(
                            n_in = n_hidden if i > 0 else n_in,
                            n_out = n_hidden,
                            activation = activation,
                            order = args.order,
                            mode = args.mode
                        )
            else:
                raise Exception("unknown layer type: {}".format(args.layer))

            layers.append(layer)
            prev_output = layer.forward_all(prev_output)
            if pooling:
                softmax_inputs.append(T.sum(prev_output, axis=0)) # summing over columns
            else:
                softmax_inputs.append(prev_output[-1])
            prev_output = apply_dropout(prev_output, dropout)
            size += n_hidden

        # final feature representation is the concatenation of all extraction layers
        if pooling:
            softmax_input = T.concatenate(softmax_inputs, axis=1) / x.shape[0]
        else:
            softmax_input = T.concatenate(softmax_inputs, axis=1)
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        # feed the feature repr. to the softmax output layer
        layers.append( Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = softmax,
                has_bias = False
        ) )

        for l,i in zip(layers, range(len(layers))):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

        # probability of y given x (softmax output)
        self.p_y_given_x = layers[-1].forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean( T.nnet.categorical_crossentropy(
                                    self.p_y_given_x,
                                    y
                            ))

        # adding regularizations
        self.l2_sqr = None
        self.params = [ ]
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))
Example #8
    def ready(self, args, train):
        # len * batch
        depth = args["depth"]
        self.args = args
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = [
            T.matrix(dtype=theano.config.floatX) for i in xrange(depth * 2)
        ]

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        self.dropout = theano.shared(dropout_prob)
        rnn_dropout_prob = np.float64(args["rnn_dropout"]).astype(
            theano.config.floatX)
        self.rnn_dropout = theano.shared(rnn_dropout_prob)

        self.n_d = args["hidden_dim"]

        embedding_layer = EmbeddingLayer(n_d=self.n_d,
                                         vocab=set(w for w in train))
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

        activation = get_activation_by_name(args["activation"])

        layers = self.layers = []
        for i in xrange(depth):
            rnn_layer = KernelNN(n_in=self.n_d,
                                 n_out=self.n_d,
                                 activation=activation,
                                 highway=args["highway"],
                                 dropout=self.rnn_dropout)
            layers.append(rnn_layer)

        output_layer = Layer(
            n_in=self.n_d,
            n_out=self.n_V,
            activation=T.nnet.softmax,
        )
        output_layer.W = embedding_layer.embeddings.T

        # (len*batch) * n_d
        x_flat = embedding_layer.forward(self.idxs.ravel())

        # len * batch * n_d
        x = apply_dropout(x_flat, self.dropout)
        #x = x_flat
        x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

        # len * batch * (n_d+n_d)
        self.last_state = []
        prev_h = x
        for i in xrange(depth):
            hidden = self.init_state[i * 2:i * 2 + 2]
            c, h = layers[i].forward_all(prev_h, hidden, return_c=True)
            self.last_state += [c[-1], h[-1]]
            prev_h = h

        prev_h = apply_dropout(prev_h, self.dropout)
        self.p_y_given_x = output_layer.forward(prev_h.reshape(x_flat.shape))

        idys = self.idys.ravel()
        self.nll = T.nnet.categorical_crossentropy(self.p_y_given_x, idys)

        self.params = [x for l in layers for x in l.params]
        self.params += [embedding_layer.embeddings, output_layer.b]
        self.num_params = sum(
            len(x.get_value(borrow=True).ravel()) for x in self.params)
        say("# of params in total: {}\n".format(self.num_params))
        layers += [embedding_layer, output_layer]
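Setting output_layer.W to embedding_layer.embeddings.T ties the input and output embeddings, which is why self.params lists only the embedding matrix and the output bias rather than a separate softmax weight matrix. A small NumPy sketch of a tied softmax output (the softmax helper here is written out for illustration):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

n_V, n_d = 5, 3
embeddings = np.random.randn(n_V, n_d)   # shared input/output embedding matrix
b = np.zeros(n_V)
h = np.random.randn(2, n_d)              # hidden states for two positions

# tied output layer: W is the transposed embedding matrix,
# as in output_layer.W = embedding_layer.embeddings.T above
p_y_given_x = softmax(h.dot(embeddings.T) + b)
print(p_y_given_x.shape)                 # (2, n_V)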
Example #9
    def ready_one_domain(self, idxs, idys, dom_ids, gold_rels, \
                         cnn_layer, rel_hid_layer, rel_out_layer, trans_layer, \
                         dom_hid_layer, dom_out_layer, lab_hid_layer, lab_out_layer):
        dropout = self.dropout
        embedding_layer = self.embedding_layer
        n_d = self.n_d
        n_e = self.n_e
        len_sentence, len_document, batch = idxs.shape
        dw = theano.shared(np.float64(0.4).astype(theano.config.floatX))

        # (len(sent)*len(doc)*batch)*n_e
        sent_input_flat = embedding_layer.forward(idxs.ravel())
        sent_input_flat = apply_dropout(sent_input_flat, dropout)
        
        # len(sent)*(len(doc)*batch)*n_e
        sent_input = sent_input_flat.reshape((len_sentence, len_document*batch, n_e))

        # len(sent) * (len(doc)*batch) * n_d
        sent_output = cnn_layer.forward_all(sent_input)
        
        # reconstruction loss
        recon_loss = self.get_recon_loss(idxs, sent_output)
        
        # max pooling, (len(doc)*batch) * n_d
        sent_embedding = T.max(sent_output, axis=0)
        sent_embedding = apply_dropout(sent_embedding, dropout)

        # relevance score, (len(doc)*batch) * 2
        rel_score = rel_out_layer.forward(rel_hid_layer.forward(sent_embedding)).ravel()
        
        # relevance loss
        gold_rels = gold_rels.ravel()
        rel_mask = T.cast(T.neq(gold_rels, REL_PAD), theano.config.floatX)
        gold_rel_mask = rel_mask * T.cast(T.neq(gold_rels, REL_UNK), theano.config.floatX)
        rel_loss = T.sum((gold_rels - rel_score) ** 2 * gold_rel_mask)
        n_rel_loss = batch
        rel_loss = rel_loss / n_rel_loss
        
        # document embedding via weighted combination, batch * n_d
        rel_score = (rel_score * rel_mask).reshape((len_document, batch))
        weighted_sent_embedding = sent_embedding.reshape((len_document, batch, n_d)) * rel_score.dimshuffle((0, 1, 'x'))
        n = T.sum(rel_score, axis=0) + 1e-8 * T.sum(rel_mask, axis=0)
        orig_doc_embedding = T.sum(weighted_sent_embedding, axis=0) / n.dimshuffle((0, 'x'))
        
        # transform document embedding, batch * n_d
        doc_embedding = trans_layer.forward(orig_doc_embedding)
        
        # domain prediction. batch * 2 
        dom_logprob = dom_out_layer.forward(apply_dropout(dom_hid_layer.forward(doc_embedding), dropout))

        # domain loss
        dom_loss = -dom_logprob[T.arange(batch), dom_ids]
        dom_loss = dw * T.mean(dom_loss)

        # domain adv loss
        adv_loss = self.rho * (-dom_loss)

        # label prediction. batch * n_c
        lab_logprob = lab_out_layer.forward(apply_dropout(lab_hid_layer.forward(doc_embedding), dropout))
        lab_prob = T.exp(lab_logprob)
        
        # label loss
        lab_loss = -lab_logprob[T.arange(batch), idys]
        lab_loss = T.mean(lab_loss)
        
        return lab_loss, rel_loss, dom_loss, adv_loss, lab_prob, recon_loss
Example #10
    def ready(self):
        args = self.args
        embedding_layer = self.embedding_layer
        self.n_hidden = args.hidden_dim
        self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX))

        # x is length * batch_size
        # y is batch_size
        self.x = T.imatrix('x')
        self.y = T.ivector('y')

        x = self.x
        y = self.y
        n_hidden = self.n_hidden
        n_in = self.n_in

        # fetch word embeddings
        # (len * batch_size) * n_in
        slices = embedding_layer.forward(x.ravel())
        self.slices = slices

        # 3-d tensor, len * batch_size * n_in
        slices = slices.reshape((x.shape[0], x.shape[1], n_in))

        # stacking the feature extraction layers
        pooling = args.pooling
        depth = args.depth
        layers = self.layers = []
        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        size = 0
        softmax_inputs = []
        activation = get_activation_by_name(args.act)
        for i in range(depth):
            if args.layer.lower() == "lstm":
                layer = LSTM(n_in=n_hidden if i > 0 else n_in, n_out=n_hidden)
            elif args.layer.lower() == "strcnn":
                layer = StrCNN(n_in=n_hidden if i > 0 else n_in,
                               n_out=n_hidden,
                               activation=activation,
                               decay=args.decay,
                               order=args.order)
            elif args.layer.lower() == "rcnn":
                layer = RCNN(n_in=n_hidden if i > 0 else n_in,
                             n_out=n_hidden,
                             activation=activation,
                             order=args.order,
                             mode=args.mode)
            else:
                raise Exception("unknown layer type: {}".format(args.layer))

            layers.append(layer)
            prev_output = layer.forward_all(prev_output)
            if pooling:
                softmax_inputs.append(T.sum(prev_output,
                                            axis=0))  # summing over columns
            else:
                softmax_inputs.append(prev_output[-1])
            prev_output = apply_dropout(prev_output, dropout)
            size += n_hidden

        # final feature representation is the concatenation of all extraction layers
        if pooling:
            softmax_input = T.concatenate(softmax_inputs, axis=1) / x.shape[0]
        else:
            softmax_input = T.concatenate(softmax_inputs, axis=1)
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        # feed the feature repr. to the softmax output layer
        layers.append(
            Layer(n_in=size,
                  n_out=self.nclasses,
                  activation=softmax,
                  has_bias=False))

        for l, i in zip(layers, range(len(layers))):
            say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))

        # probability of y given x (softmax output)
        self.p_y_given_x = layers[-1].forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean(
            T.nnet.categorical_crossentropy(self.p_y_given_x, y))

        # adding regularizations
        self.l2_sqr = None
        self.params = []
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))
Example #11
    def ready(self):
        encoder = self.encoder
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = encoder.dropout

        # len*batch
        x = self.x = encoder.x
        z = self.z = encoder.z

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(
                        n_in = n_e,# if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = LSTM(
                        n_in = n_e,# if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0,1,"x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = 1,
                activation = sigmoid
            )

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        probs2 = probs.reshape(x.shape)
        self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8")

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        self.z_pred = theano.gradient.disconnected_grad(z_pred)

        z2 = z.dimshuffle((0,1,"x"))
        logpz = - T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

        loss_mat = encoder.loss_mat
        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:,args.aspect]
        self.loss_vec = loss_vec

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        cost = self.cost = cost_logpz * 10 + l2_cost
        print "cost.dtype", cost.dtype

        self.cost_e = loss * 10 + encoder.l2_cost
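Because z is sampled and then detached with theano.gradient.disconnected_grad, the generator is trained with a score-function (REINFORCE-style) surrogate: the downstream cost of each example multiplies the summed log-probability of its sampled mask. A NumPy sketch of the surrogate value with made-up probabilities and costs:

import numpy as np

rng = np.random.RandomState(0)

# probs: per-position selection probabilities, len x batch
probs = np.array([[0.9, 0.1],
                  [0.2, 0.8],
                  [0.7, 0.3]])
z = rng.binomial(n=1, p=probs)                           # sampled rationale mask
logpz = z * np.log(probs) + (1 - z) * np.log(1 - probs)  # = -binary_crossentropy

cost_vec = np.array([0.5, 1.3])                          # downstream cost per example
# surrogate whose gradient w.r.t. the generator parameters is
# E[cost * d log p(z)], while z itself carries no gradient
cost_logpz = np.mean(cost_vec * np.sum(logpz, axis=0))
print(cost_logpz)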
Example #12
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(n_in=n_e,
                         n_out=n_d,
                         activation=activation,
                         order=args.order)
            elif layer_type == "lstm":
                l = LSTM(n_in=n_e, n_out=n_d, activation=activation)
            layers.append(l)

        # len * batch
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = ZLayer(
            n_in=size, n_hidden=args.hidden_dimension2, activation=activation)

        # sample z given text (i.e. x)
        z_pred, sample_updates = output_layer.sample_all(h_final)

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        probs = output_layer.forward_all(h_final, z_pred)
        print "probs", probs.ndim

        logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
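zsum and zdiff are the two regularizers applied to the sampled rationale mask: zsum counts how many positions are selected per example, and zdiff counts how often the selection switches on or off, which discourages fragmented rationales. A NumPy sketch on a small made-up mask:

import numpy as np

# z: sampled 0/1 rationale mask, len x batch
z = np.array([[1, 0],
              [1, 0],
              [0, 1],
              [1, 1]], dtype="float32")

zsum = z.sum(axis=0)                          # sparsity: number of selected words
zdiff = np.abs(z[1:] - z[:-1]).sum(axis=0)    # coherence: number of on/off switches
print(zsum)    # [3. 2.]
print(zdiff)   # [2. 1.]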
Example #13
    def ready(self):
        encoder = self.encoder
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = encoder.dropout

        # len*batch
        x = self.x = encoder.x
        z = self.z = encoder.z

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in range(2):
            if layer_type == "rcnn":
                l = RCNN(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation,
                    order=args.order)
            elif layer_type == "lstm":
                l = LSTM(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation)
            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, "x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        probs2 = probs.reshape(x.shape)
        self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(
            self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8")

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        self.z_pred = theano.gradient.disconnected_grad(z_pred)

        z2 = z.dimshuffle((0, 1, "x"))
        logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        zdiff_pre = (z[1:] - z[:-1]) * 1.0
        zdiff = T.sum(abs(zdiff_pre), axis=0, dtype=theano.config.floatX)

        loss_mat = encoder.loss_mat
        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:, args.aspect]
        self.loss_vec = loss_vec

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        cost = self.cost = cost_logpz * 10 + l2_cost
        print("cost.dtype", cost.dtype)

        self.cost_e = loss * 10 + encoder.l2_cost
Example #14
    def ready(self):
        generator = self.generator
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = generator.dropout

        # len*batch
        x = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0,1,"x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = ExtLSTM(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch * 1
        masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # len*batch*n_e
        embs = generator.word_embs

        pooling = args.pooling
        lst_states = [ ]
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum/cnt_non_padding) # mean pooling
            else:
                lst_states.append(h_next[-1]) # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = sigmoid
            )

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch
        loss_mat = self.loss_mat = (preds-y)**2

        pred_diff = self.pred_diff = T.mean(T.max(preds, axis=1) - T.min(preds, axis=1))

        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:,args.aspect]
        self.loss_vec = loss_vec

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_g = cost_logpz * 10 + generator.l2_cost
        self.cost_e = loss * 10 + l2_cost
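The mean-pooling branch above averages the hidden states only over positions that are both non-padding and selected by z, using cnt_non_padding to avoid dividing by zero. A NumPy sketch of that masked average with made-up shapes:

import numpy as np

# h: len x batch x n_d hidden states; masks: len x batch x 1,
# zero on padding or unselected positions, as in the pooling branch above
h = np.random.randn(4, 2, 3)
masks = np.array([[1, 1],
                  [1, 1],
                  [1, 0],
                  [0, 0]], dtype="float32")[:, :, None]

cnt_non_padding = masks.sum(axis=0) + 1e-8    # batch x 1
pooled = (h * masks).sum(axis=0) / cnt_non_padding
print(pooled.shape)                           # (2, 3): one vector per example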
Example #15
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout).astype(theano.config.floatX)
            )

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(
                        n_in = n_e,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = LSTM(
                        n_in = n_e,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = ZLayer(
                n_in = size,
                n_hidden = args.hidden_dimension2,
                activation = activation
            )

        # sample z given text (i.e. x)
        z_pred, sample_updates = output_layer.sample_all(h_final)

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        probs = output_layer.forward_all(h_final, z_pred)
        print "probs", probs.ndim

        logpz = - T.nnet.binary_crossentropy(probs, z_pred) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
Example #16
    def ready(self):
        generator = self.generator
        args = self.args
        weights = self.weights

        dropout = generator.dropout

        # len(text) * batch
        idts = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0, 1, "x"))

        # batch * 2
        pairs = self.pairs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        triples = self.triples = T.imatrix()

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
            LayerType2 = ExtRCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
            LayerType2 = ExtLSTM
        #elif args.layer.lower() == "gru":
        #    LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = []
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation)
            else:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation,
                                          order=args.order,
                                          mode=args.mode,
                                          has_outgate=args.outgate)
            layers.append(feature_layer)

        extlayers = self.extlayers = []
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                           n_out=n_d,
                                           activation=activation)
            else:
                feature_layer = LayerType2(n_in=n_e if i == 0 else n_d,
                                           n_out=n_d,
                                           activation=activation,
                                           order=args.order,
                                           mode=args.mode,
                                           has_outgate=args.outgate)
            feature_layer.copy_params(layers[i])
            extlayers.append(feature_layer)

        # feature computation starts here

        xt = generator.word_embs

        # encode full text into representation
        prev_ht = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            prev_ht = ht

        # encode selected text into representation
        prev_htz = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            htz = extlayers[i].forward_all(prev_htz, z)
            prev_htz = htz

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            htz = self.normalize_3d(htz)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.htz = htz

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            htz = self.average_without_padding(htz, idts, z)
        else:
            ht = ht[-1]
            htz = htz[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
        h_final = apply_dropout(ht, dropout)
        h_final = self.normalize_2d(h_final)
        hz_final = apply_dropout(htz, dropout)
        hz_final = self.normalize_2d(hz_final)
        self.h_final = h_final
        self.hz_final = hz_final

        say("h_final dtype: {}\n".format(ht.shape))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])
        self.scores_z = T.dot(hz_final[1:], hz_final[0])

        # For training encoder:
        xp = h_final[triples.ravel()]
        xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:, 0, :]
        # num query
        pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :],
                           axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        hinge_loss = T.mean((diff > 0) * diff)

        # For training generator

        # batch
        self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
        pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:, 1]],
                                           axis=1)
        alpha = args.alpha
        loss_vec = self_cosine_distance * alpha + pair_cosine_distance * (
            1 - alpha)
        #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        sfactor = args.sparsity
        cfactor = args.sparsity * args.coherent
        scost_vec = zsum * sfactor + zdiff * cfactor

        # batch
        cost_vec = loss_vec + scost_vec
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
        self.obj = loss + sparsity_cost

        params = []
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = T.sum(p**2)  #p.norm(2)
            else:
                l2_reg = l2_reg + T.sum(p**2)  #p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.l2_cost = l2_reg

        beta = args.beta
        self.cost_g = cost_logpz + generator.l2_cost
        self.cost_e = hinge_loss + loss * beta + l2_reg
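Since h_final and hz_final are L2-normalized, the dot products used for self_cosine_distance and pair_cosine_distance are plain cosine similarities. A NumPy sketch, where normalize_2d is a guess at what the class helper of that name does:

import numpy as np

def normalize_2d(x, eps=1e-8):
    # assumed behaviour of self.normalize_2d: unit-length rows
    return x / (np.linalg.norm(x, axis=1, keepdims=True) + eps)

h_full = normalize_2d(np.random.randn(4, 5))       # full-text encodings
h_rationale = normalize_2d(np.random.randn(4, 5))  # selected-words encodings

# after normalization a dot product is a cosine similarity
self_dist = 1.0 - np.sum(h_rationale * h_full, axis=1)
pairs = np.array([[0, 2], [1, 3], [2, 0], [3, 1]])
pair_dist = 1.0 - np.sum(h_rationale * h_full[pairs[:, 1]], axis=1)
print(self_dist)
print(pair_dist)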
Example #17
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout).astype(theano.config.floatX)
            )

        # len*batch
        x = self.x = T.imatrix()

        z = self.z = T.bmatrix()
        z = z.dimshuffle((0,1,"x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order
                    )
            elif layer_type == "lstm":
                l = ExtLSTM(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            layers.append(l)

        # len * batch * 1
        masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        pooling = args.pooling
        lst_states = [ ]
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum/cnt_non_padding) # mean pooling
            else:
                lst_states.append(h_next[-1]) # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = sigmoid
            )

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch * nclasses
        loss_mat = self.loss_mat = (preds-y)**2
        loss = self.loss = T.mean(loss_mat)

        pred_diff = self.pred_diff = T.mean(T.max(preds, axis=1) - T.min(preds, axis=1))

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        cost = self.cost = loss * 10 + l2_cost
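
The pooling branch above averages hidden states over non-padding, selected positions by dividing a masked sum by cnt_non_padding. A small NumPy sketch with toy shapes, for illustration only:

import numpy as np

seq_len, batch, n_d = 5, 2, 3
h = np.random.randn(seq_len, batch, n_d)
masks = np.array([[1, 1],
                  [1, 1],
                  [1, 0],
                  [0, 0],
                  [0, 0]], dtype="float32")[:, :, None]   # len * batch * 1

cnt_non_padding = masks.sum(axis=0) + 1e-8                # batch * 1
mean_pooled = (h * masks).sum(axis=0) / cnt_non_padding   # batch * n_d
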
Ejemplo n.º 18
0
    def ready(self):
        args = self.args
        embedding_layer = self.embedding_layer
        user_embedding_layer = self.user_embedding_layer
        self.n_hidden = args.hidden_dim
        self.n_in = embedding_layer.n_d
        dropout = self.dropout = theano.shared(
            np.float64(args.dropout_rate).astype(theano.config.floatX)
        )

        # x is length * batch_size
        # y is batch_size
        self.x = T.imatrix('x')
        self.w_masks = T.fmatrix('mask')
        self.w_lens = T.fvector('lens')
        self.s_ml = T.iscalar('sent_maxlen')
        self.s_num = T.iscalar('sent_num')
        self.y = T.ivector('y')
        self.usr = T.ivector('users')

        x = self.x
        y = self.y
        usr = self.usr
        w_masks = self.w_masks
        w_lens = self.w_lens
        s_ml = self.s_ml
        s_num = self.s_num
        n_hidden = self.n_hidden
        n_emb = n_in = self.n_in

        layers = self.layers = []

        slicesu = user_embedding_layer.forward(usr)
        slices = embedding_layer.forward(x.ravel())
        self.slices = slices  # important for updating word embeddings

        # 3-d tensor, len * batch_size * n_in
        slices = slices.reshape((x.shape[0], x.shape[1], n_in))

        pooling = args.pooling
        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        size = 0

        n_hidden_t = n_hidden
        if args.direction == "bi":
            n_hidden_t = 2 * n_hidden

        softmax_inputs = []
        activation = get_activation_by_name(args.act)

        if args.layer.lower() == "lstm":
            layer = LSTM(n_in=n_in,
                         n_out=n_hidden_t,
                         direction=args.direction
                         )
        elif args.layer.lower() == "cnn":
            layer = CNN(n_in=n_in,
                        n_out=n_hidden_t,
                        activation=activation,
                        order=args.order
                        )
        else:
            raise Exception("unknown layer type: {}".format(args.layer))

        layers.append(layer)
        prev_output = layer.forward_all(prev_output, masks=w_masks)
        prev_output = apply_dropout(prev_output, dropout)

        # final feature representation is the concatenation of all extraction layers
        if args.user_atten:
            layer = IterAttentionLayer(
                n_in=n_emb,
                n_out=n_hidden_t
            )
            layers.append(layer)
            if args.user_atten_base:
                slicesu = None
            softmax_input = layers[-1].multi_hop_forward(
                prev_output, user_embs=slicesu, isWord=True, masks=w_masks)
        else:
            if pooling:
                softmax_input = T.sum(prev_output, axis=0) / w_lens.dimshuffle(0, 'x')
            else:
                ind = T.cast(w_lens - T.ones_like(w_lens), 'int32')
                softmax_input = prev_output[T.arange(ind.shape[0]), ind]

        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        n_in = n_hidden_t
        size = 0
        softmax_inputs = []
        [sentlen, emblen] = T.shape(softmax_input)
        prev_output = softmax_input.reshape(
            (sentlen / s_num, s_num, emblen)).dimshuffle(1, 0, 2)
        if args.layer.lower() == "lstm":
            layer = LSTM(n_in=n_in,
                         n_out=n_hidden_t,
                         direction=args.direction
                         )
        elif args.layer.lower() == "cnn":
            layer = CNN(n_in=n_in,
                        n_out=n_hidden_t,
                        activation=activation,
                        order=args.order,
                        )
        else:
            raise Exception("unknown layer type: {}".format(args.layer))

        layers.append(layer)
        prev_output = layer.forward_all(prev_output)
        prev_output = apply_dropout(prev_output, dropout)

        if args.user_atten:
            layer = IterAttentionLayer(
                n_in=n_emb,
                n_out=n_hidden_t
            )
            layers.append(layer)

            if args.user_atten_base:
                slicesu = None
            softmax_input = layers[-1].multi_hop_forward(
                prev_output, user_embs=slicesu, isWord=False)
        else:
            if pooling:
                softmax_input = T.sum(prev_output, axis=0) / \
                    T.cast(s_num, 'float32')
            else:
                softmax_input = prev_output[-1]
        softmax_input = apply_dropout(softmax_input, dropout, v2=True)

        size = n_hidden_t
        layers.append(Layer(
            n_in=size,
            n_out=self.nclasses,
            activation=softmax,
            has_bias=False
        ))
        if not args.fix_emb:
            for l, i in zip(layers, range(len(layers))):
                say("layer {}: n_in={}\tn_out={}\n".format(
                    i, l.n_in, l.n_out
                ))
        else:
            for l, i in zip(layers[1:], range(len(layers[1:]))):
                say("layer {}: n_in={}\tn_out={}\n".format(
                    i, l.n_in, l.n_out
                ))

        # probability of y given x (softmax output)
        self.p_y_given_x = layers[-1].forward(softmax_input)
        self.pred = T.argmax(self.p_y_given_x, axis=1)
        self.nll_loss = T.mean(T.nnet.categorical_crossentropy(
            self.p_y_given_x,
            y
        ))

        # adding regularizations
        self.l2_sqr = None
        self.params = []
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(x.get_value(borrow=True).ravel())
                      for x in self.params)
        say("total # parameters: {}\n".format(nparams))
Ejemplo n.º 19
0
    def ready(self):
        args = self.args
        w_emb_layer = self.w_emb_layer
        c_emb_layer = self.c_emb_layer
        r_emb_layers = self.r_emb_layers
        r_matrix_layers = self.r_matrix_layers

        char_dim = self.char_dim = args.char_dim
        char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim
        word_dim = self.word_dim = args.word_dim
        word_lstm_dim = self.word_lstm_dim = args.word_lstm_dim

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout).astype(theano.config.floatX)
            )

        word_ids = self.word_ids = T.ivector('word_ids')
        char_ids = self.char_ids = T.imatrix('char_ids')
        char_lens = self.char_lens = T.fvector('char_lens')
        char_masks = self.char_masks = T.imatrix('char_masks')
        up_ids = self.up_ids = T.imatrix('up_ids')
        up_rels = self.up_rels = T.imatrix('up_rels')
        up_id_masks = self.up_id_masks = T.imatrix('up_id_masks')
        down_ids = self.down_ids = T.imatrix('down_ids')
        down_rels = self.down_rels = T.imatrix('down_rels')
        down_id_masks = self.down_id_masks = T.imatrix('down_id_masks')
        tag_ids = self.tag_ids = T.ivector('tag_ids')

        layers = self.layers = [w_emb_layer, c_emb_layer]
        layers.extend(r_emb_layers)
        layers.extend(r_matrix_layers)

        inputs = self.inputs = []

        inputs.append(self.word_ids)
        inputs.append(self.char_ids)
        inputs.append(self.char_lens)
        inputs.append(self.char_masks)
        inputs.append(self.up_ids)
        inputs.append(self.up_rels)
        inputs.append(self.up_id_masks)
        inputs.append(self.down_ids)
        inputs.append(self.down_rels)
        inputs.append(self.down_id_masks)
        inputs.append(self.tag_ids)

        wslices = w_emb_layer.forward(word_ids)
        cslices = c_emb_layer.forward(char_ids.ravel())
        cslices = cslices.reshape((char_ids.shape[0], char_ids.shape[1], char_dim))
        cslices = cslices.dimshuffle(1, 0, 2)

        bv_ur_slicess = []
        bv_dr_slicess = []
        b_ur_slicess = []
        b_dr_slicess = []

        bv_ur_matrixss = []
        bv_dr_matrixss = []
        b_ur_matrixss = []
        b_dr_matrixss = []

        for r_matrix_layer in r_matrix_layers:
            bv_ur_matrixs = r_matrix_layer.forward1(up_rels.ravel())
            bv_dr_matrixs = r_matrix_layer.forward1(down_rels.ravel())
            b_ur_matrixs = r_matrix_layer.forward2(up_rels.ravel())
            b_dr_matrixs = r_matrix_layer.forward2(down_rels.ravel())
            bv_ur_matrixss.append(bv_ur_matrixs.reshape((up_rels.shape[0], up_rels.shape[1], word_dim, word_dim)))
            bv_dr_matrixss.append(bv_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim)))
            b_ur_matrixss.append(b_ur_matrixs.reshape((up_rels.shape[0], up_rels.shape[1], word_dim, word_dim)))
            b_dr_matrixss.append(b_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim)))

        for r_emb_layer in r_emb_layers:
            bv_ur_slices = r_emb_layer.forward(up_rels.ravel())
            bv_dr_slices = r_emb_layer.forward(down_rels.ravel())
            b_ur_slices = r_emb_layer.forward2(up_rels.ravel())
            b_dr_slices = r_emb_layer.forward2(down_rels.ravel())
            bv_ur_slicess.append(bv_ur_slices.reshape((up_rels.shape[0], up_rels.shape[1], word_dim)))
            bv_dr_slicess.append(bv_dr_slices.reshape((down_rels.shape[0], down_rels.shape[1], word_dim)))
            b_ur_slicess.append(b_ur_slices.reshape((up_rels.shape[0], up_rels.shape[1], word_dim)))
            b_dr_slicess.append(b_dr_slices.reshape((down_rels.shape[0], down_rels.shape[1], word_dim)))

        char_masks = char_masks.dimshuffle(1, 0)

        prev_output = wslices
        prev_size = word_dim

        if char_dim:
            layers.append(LSTM(
                n_in = char_dim,
                n_out = char_lstm_dim,
                direction = 'bi' if args.char_bidirect else 'si'
            ))
            prev_output_2 = cslices
            prev_output_2 = apply_dropout(prev_output_2, dropout, v2 = True)
            prev_output_2 = layers[-1].forward_all(cslices, char_masks)
            prev_output_2 = T.sum(prev_output_2, axis = 0)
            prev_output_2 = prev_output_2 / (1e-6 * T.ones_like(char_lens) + char_lens).dimshuffle(0, 'x')

            prev_size += char_lstm_dim
            prev_output = T.concatenate([prev_output, prev_output_2], axis = 1)

        prev_output = apply_dropout(prev_output, dropout)
        if args.conv != 0:
            for i in range(args.clayer):
                layers.append(GKNNMultiHeadGate(
                        n_in = prev_size,
                        n_out = prev_size,
                        n_head = args.head
                    ))
                prev_output = layers[-1].forward_all(prev_output, up_ids, up_id_masks, bv_ur_slicess[0], down_ids, down_id_masks, bv_dr_slicess[0])
                prev_output = apply_dropout(prev_output, dropout)

        #prev_size *= 2
        #layers.append(LSTM(
        #    n_in = prev_size,
        #    n_out = word_lstm_dim,
        #    direction = 'bi' if args.word_bidirect else 'si'
        #))

        #prev_output = prev_output.dimshuffle(0, 'x', 1)
        #prev_output = layers[-1].forward_all(prev_output)
        #prev_output = prev_output.reshape((prev_output.shape[0], prev_output.shape[-1]))

        #prev_size = word_lstm_dim

        layers.append(Layer(
            n_in = prev_size,
            n_out = args.classes,
            activation = linear, #ReLU,
            has_bias = False
        ))

        n_tags = args.classes
        s_len = char_ids.shape[0]
        tags_scores = layers[-1].forward(prev_output)
        transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

        small = -1000
        b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        observations = T.concatenate(
            [tags_scores, small * T.ones((s_len, 2))],
            axis=1
        )

        observations = T.concatenate(
            [b_s, observations, e_s],
            axis=0
        )

        real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
        b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)

        pre_ids = T.arange(s_len + 1)

        s_ids = T.arange(s_len + 1) + 1

        real_path_score += transitions[
            padded_tags_ids[pre_ids],
            padded_tags_ids[s_ids]
        ].sum()

        all_paths_scores = CRFForward(observations, transitions)
        self.nll_loss = nll_loss = - (real_path_score - all_paths_scores)
        preds = CRFForward(observations, transitions, viterbi = True,
                           return_alpha = False, return_best_sequence = True)

        self.pred = preds[1:-1]

        self.l2_sqr = None
        params = self.params = [transitions]
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        #for l, i in zip(layers[3:], range(len(layers[3:]))):
        for l, i in zip(layers[2+len(r_emb_layers)+len(r_matrix_layers):],
                        range(len(layers[2+len(r_emb_layers)+len(r_matrix_layers):]))):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))

        cost = self.nll_loss + self.l2_sqr

        lr_method_name = args.learning
        lr_method_parameters = {}
        lr_method_parameters['lr'] = args.learning_rate
        updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)

        f_train = theano.function(
                inputs = self.inputs,
                outputs = [cost, nll_loss],
                updates = updates,
                allow_input_downcast = True
            )

        f_eval = theano.function(
                inputs = self.inputs[:-1],
                outputs = self.pred,
                allow_input_downcast = True
            )

        return f_train, f_eval
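
The CRF objective above scores the gold tag sequence as per-token emission scores plus transition scores between consecutive tags, with begin/end states appended; CRFForward then supplies the log-sum over all paths. A small NumPy sketch of the gold-path part with toy scores (CRFForward itself is not reproduced here):

import numpy as np

n_tags, s_len = 4, 5
tags_scores = np.random.randn(s_len, n_tags)           # emission scores
transitions = np.random.randn(n_tags + 2, n_tags + 2)  # +2 for begin/end states
tag_ids = np.array([0, 2, 1, 3, 2])

b_id, e_id = n_tags, n_tags + 1
padded = np.concatenate([[b_id], tag_ids, [e_id]])

emission_score = tags_scores[np.arange(s_len), tag_ids].sum()
transition_score = transitions[padded[:-1], padded[1:]].sum()
real_path_score = emission_score + transition_score
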
Ejemplo n.º 20
0
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout).astype(theano.config.floatX)
            )

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        layer_type = args.layer.lower()
        for i in xrange(1):
            l = CNN(
                    n_in = n_e,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
            layers.append(l)

        # len * batch * 1
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0,1,'x'))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h_final = h1
        size = n_d
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = 1,
                activation = sigmoid
            )

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        self.MRG_rng = MRG_RandomStreams()
        z_pred_dim3 = self.MRG_rng.binomial(size=probs.shape, p=probs, dtype="int8")
        z_pred = z_pred_dim3.reshape(x.shape)

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        print "z_pred", z_pred.ndim

        #logpz = - T.nnet.binary_crossentropy(probs, z_pred_dim3) * masks
        logpz = - T.nnet.binary_crossentropy(probs, z_pred_dim3)
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
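
The zsum and zdiff quantities above are the two rationale regularizers: zsum counts how many positions are selected per example, and zdiff counts 0/1 transitions, so penalizing it favors contiguous selections over scattered words. A toy NumPy illustration:

import numpy as np

z = np.array([[1, 0],
              [1, 0],
              [0, 1],
              [1, 1]], dtype="float32")     # len * batch, sampled selections

zsum = z.sum(axis=0)                        # words selected per example -> [3., 2.]
zdiff = np.abs(z[1:] - z[:-1]).sum(axis=0)  # on/off switches per example -> [2., 1.]
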
Ejemplo n.º 21
0
    def ready(self):
        args = self.args
        weights = self.weights

        # len(source) * batch
        idxs = self.idxs = T.imatrix()

        # len(target) * batch
        idys = self.idys = T.imatrix()
        idts = idys[:-1]
        idgs = idys[1:]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d
        n_V = self.n_V = embedding_layer.n_V

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = []
        for i in range(depth * 2):
            if LayerType != RCNN:
                feature_layer = LayerType(n_in=n_e if i / 2 == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation)
            else:
                feature_layer = LayerType(n_in=n_e if i / 2 == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation,
                                          order=args.order,
                                          mode=args.mode,
                                          has_outgate=args.outgate)
            layers.append(feature_layer)

        self.output_layer = output_layer = Layer(
            n_in=n_d,
            n_out=n_V,
            activation=T.nnet.softmax,
        )

        # feature computation starts here

        # (len*batch)*n_e
        xs_flat = embedding_layer.forward(idxs.ravel())
        xs_flat = apply_dropout(xs_flat, dropout)
        if weights is not None:
            xs_w = weights[idxs.ravel()].dimshuffle((0, 'x'))
            xs_flat = xs_flat * xs_w
        # len*batch*n_e
        xs = xs_flat.reshape((idxs.shape[0], idxs.shape[1], n_e))

        # (len*batch)*n_e
        xt_flat = embedding_layer.forward(idts.ravel())
        xt_flat = apply_dropout(xt_flat, dropout)
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0, 'x'))
            xt_flat = xt_flat * xt_w
        # len*batch*n_e
        xt = xt_flat.reshape((idts.shape[0], idts.shape[1], n_e))

        prev_hs = xs
        prev_ht = xt
        for i in range(depth):
            # len*batch*n_d
            hs = layers[i * 2].forward_all(prev_hs, return_c=True)
            ht = layers[i * 2 + 1].forward_all(prev_ht, hs[-1])
            hs = hs[:, :, -n_d:]
            ht = ht[:, :, -n_d:]
            prev_hs = hs
            prev_ht = ht
            prev_hs = apply_dropout(hs, dropout)
            prev_ht = apply_dropout(ht, dropout)

        self.p_y_given_x = output_layer.forward(
            prev_ht.reshape((xt_flat.shape[0], n_d)))

        h_final = hs[-1]
        self.scores2 = -(h_final[1:] - h_final[0]).norm(2, axis=1)
        h_final = self.normalize_2d(h_final)
        self.scores = T.dot(h_final[1:], h_final[0])

        # (len*batch)
        nll = T.nnet.categorical_crossentropy(self.p_y_given_x, idgs.ravel())
        nll = nll.reshape(idgs.shape)
        self.nll = nll
        self.mask = mask = T.cast(T.neq(idgs, self.padding_id),
                                  theano.config.floatX)
        nll = T.sum(nll * mask, axis=0)

        #layers.append(embedding_layer)
        layers.append(output_layer)
        params = []
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.loss = T.mean(nll)
        self.cost = self.loss + l2_reg
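
The sequence loss above zeroes the per-token cross-entropy at padding positions before summing over time, so padded steps contribute nothing. A toy NumPy sketch under an assumed padding id:

import numpy as np

padding_id = 0
idgs = np.array([[3, 2],
                 [1, 0],
                 [4, 0]])                   # len * batch gold tokens
nll = np.random.rand(*idgs.shape)           # per-token negative log-likelihood
mask = (idgs != padding_id).astype("float32")

nll_per_seq = (nll * mask).sum(axis=0)      # batch
loss = nll_per_seq.mean()
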
Ejemplo n.º 22
0
    def ready(self):
        global total_generate_time
        #say("in generator ready: \n")
        #start_generate_time = time.time()
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(n_in=n_e,
                         n_out=n_d,
                         activation=activation,
                         order=args.order)
            elif layer_type == "lstm":
                l = LSTM(n_in=n_e, n_out=n_d, activation=activation)

            l = Layer(n_in=n_e, n_out=n_d, activation=sigmoid)
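            # note: this plain sigmoid Layer overrides the RCNN/LSTM created just above,
            # so the recurrent layer is never used in this version of the generator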

            layers.append(l)

        # len * batch * 1
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), theano.config.floatX).dimshuffle(
            (0, 1, "x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward(embs)
        h2 = layers[1].forward(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        #size = n_e

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)
        #probs = output_layer.forward(embs)
        #probs1 = probs.reshape(x.shape)

        #probs_rev = output_layer.forward(flipped_embs)
        #probs1_rev = probs.reshape(x.shape)

        #probs = T.concatenate([probs1, probs1_rev[::-1]], axis=2)

        # len*batch
        probs2 = probs.reshape(x.shape)
        if self.args.seed is not None:
            self.MRG_rng = MRG_RandomStreams(self.args.seed)
        else:
            self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(
            self.MRG_rng.binomial(size=probs2.shape, p=probs2),
            theano.config.floatX)  #"int8")

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        #self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        z2 = z_pred.dimshuffle((0, 1, "x"))
        logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
Ejemplo n.º 23
0
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        z = self.z = T.bmatrix()
        z = z.dimshuffle((0, 1, "x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        depth = args.depth
        layer_type = args.layer.lower()
        for i in range(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation,
                            order=args.order)
            elif layer_type == "lstm":
                l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation)
            layers.append(l)

        # len * batch * 1
        masks = T.cast(
            T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
            theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        pooling = args.pooling
        lst_states = []
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
            else:
                lst_states.append(h_next[-1])  # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=self.nclasses,
                                                 activation=sigmoid)

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch * nclasses
        loss_mat = self.loss_mat = (preds - y)**2
        loss = self.loss = T.mean(loss_mat)

        pred_diff = self.pred_diff = T.mean(
            T.max(preds, axis=1) - T.min(preds, axis=1))

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        cost = self.cost = loss * 10 + l2_cost
Ejemplo n.º 24
0
    def ready(self):
        args = self.args
        index = self.index = T.lscalar()
        x = self.x = T.fmatrix()
        y = self.y = T.ivector()

        dropout = self.dropout = theano.shared(np.float64(args.dropout).astype(
                        "float32"))

        n_d = args.hidden_dim
        layers = self.layers = [ ]
        for i in xrange(args.depth):
            l = Layer(
                        n_in = 28*28 if i == 0 else n_d,
                        n_out = n_d,
                        activation = ReLU
                    )
            layers.append(l)


        output_layer = self.output_layer = Layer(
                    n_in = n_d,
                    n_out = 10,
                    activation = softmax
                )

        h = x
        for l in layers:
            h = l.forward(h)
            h = apply_dropout(h, dropout)

        self.h_final = h

        # batch * 10
        probs = self.probs = output_layer.forward(h)

        # batch
        preds = self.preds = T.argmax(probs, axis=1)
        err = self.err = T.mean(T.cast(T.neq(preds, y), dtype="float32"))

        #
        loss = self.loss = -T.mean( T.log(probs[T.arange(y.shape[0]), y]) )
        #loss = self.loss = T.mean( T.nnet.categorical_crossentropy(
        #                            probs,
        #                            y
        #                    ))

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost += T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        self.l2_cost = l2_cost
        self.cost = loss + l2_cost
        print "cost.dtype", self.cost.dtype
Ejemplo n.º 25
0
    def ready(self):
        generator = self.generator
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]
        unk_id = embedding_layer.vocab_map["<unk>"]
        unk_vec = embedding_layer.embeddings[unk_id]

        dropout = generator.dropout

        # len*batch
        x = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0,1,"x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = [ ]
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            l = CNN(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
            layers.append(l)

        # len * batch * 1
        masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # len*batch*n_e
        embs = generator.word_embs*z + unk_vec.dimshuffle(('x','x',0))*(1-z)

        pooling = args.pooling
        lst_states = [ ]
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum/cnt_non_padding) # mean pooling
            else:
                lst_states.append(T.max(h_next, axis=0))
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(
                n_in = size,
                n_out = self.nclasses,
                activation = sigmoid
            )

        # batch * nclasses
        p_y_given_x = self.p_y_given_x = output_layer.forward(h_final)
        preds = self.preds = p_y_given_x > 0.5
        print preds, preds.dtype
        print self.nclasses

        # batch * nclasses
        loss_mat = T.nnet.binary_crossentropy(p_y_given_x, y)

        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:,args.aspect]
        self.loss_vec = loss_vec

        self.true_pos = T.sum(preds*y)
        self.tot_pos = T.sum(preds)
        self.tot_true = T.sum(y)

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = [ ]
        for l in layers + [ output_layer ]:
            for p in l.params:
                params.append(p)
        if not args.fix_emb:
            params += embedding_layer.params
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_g = cost_logpz + generator.l2_cost
        self.cost_e = loss + l2_cost
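
This encoder reads the generator's blanked embeddings: positions with z=1 keep their word embedding, positions with z=0 are replaced by the <unk> vector. A toy NumPy version of that substitution with made-up sizes:

import numpy as np

seq_len, batch, n_e = 4, 2, 3
word_embs = np.random.randn(seq_len, batch, n_e)
unk_vec = np.random.randn(n_e)
z = np.random.randint(0, 2, size=(seq_len, batch, 1)).astype("float64")

# z == 1 keeps the word embedding, z == 0 swaps in the <unk> embedding
embs = word_embs * z + unk_vec[None, None, :] * (1 - z)
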
Ejemplo n.º 26
0
    def ready(self):
        args = self.args
        embedding_layer = self.embedding_layer
        num_aspects = self.num_aspects

        self.n_emb = embedding_layer.n_d

        dropout = self.dropout = theano.shared(
                np.float64(args.dropout_rate).astype(theano.config.floatX)
            )

        self.x = T.imatrix('x')
        self.w_masks = T.fmatrix('mask')
        self.w_lens = T.fvector('sent_len')
        self.s_maxlen = T.iscalar('sent_max_len')
        self.s_num = T.iscalar('sent_num')
        self.y = T.ivector('y')
        self.ay = T.imatrix('ay')
        self.ay_mask = T.fmatrix('ay_mask')
        self.aay = T.itensor3('aay')

        x = self.x
        query = self.query

        w_masks = self.w_masks
        w_lens = self.w_lens
        s_ml = self.s_maxlen
        s_num = self.s_num
        n_emb = self.n_emb

        y = self.y
        ay = self.ay
        ay_mask = self.ay_mask
        aay = self.aay

        layers = self.layers = [embedding_layer]
        slices = embedding_layer.forward(x.ravel())
        self.slices = slices = slices.reshape((x.shape[0], x.shape[1], n_emb))

        slices_query = embedding_layer.forward(query.flatten(), is_node = False)
        slices_query = slices_query.reshape((query.shape[0], query.shape[1], n_emb))

        layers.append(Query_Repr_Layer(slices_query))
        slices_query_tmp = slices_query = layers[-1].forward()

        layer = LSTM(n_in = n_emb, n_out = n_emb)
        layers.append(layer)

        prev_output = slices
        prev_output = apply_dropout(prev_output, dropout, v2=True)
        prev_output = layers[-1].forward_all(prev_output, w_masks)

        layer = Layer(n_in = n_emb, n_out = n_emb, activation = tanh)
        layers.append(layer)
        self.slices_query = slices_query = layers[-1].forward(slices_query)

        maskss = []
        w_lenss = []
        for i in range(num_aspects):
            maskss.append(w_masks)
            w_lenss.append(w_lens)

        maskss = T.concatenate(maskss, axis = 1)
        w_lenss = T.concatenate(w_lenss)

        layer = IterAttentionLayer(n_in = n_emb, n_out = n_emb)
        layers.append(layer)
        prev_output = layers[-1].forward(prev_output, slices_query, is_word = True, hop = args.hop_word, masks = w_masks, aspect_num = num_aspects)
        prev_output = prev_output.reshape((prev_output.shape[0] * prev_output.shape[1], prev_output.shape[2]))
        prev_output = apply_dropout(prev_output, dropout, v2=True)

        prev_output = prev_output.reshape((num_aspects, prev_output.shape[0] / (num_aspects * s_num), s_num, prev_output.shape[1]))
        prev_output = prev_output.dimshuffle(2, 0, 1, 3)
        prev_output = prev_output.reshape((prev_output.shape[0], prev_output.shape[1] * prev_output.shape[2], prev_output.shape[3]))

        layer = LSTM(n_in = n_emb * args.hop_word, n_out = n_emb)
        layers.append(layer)
        prev_output = layers[-1].forward_all(prev_output)

        #layers.append(Query_Repr_Layer(slices_query))
        #slices_query = layers[-1].forward()
        layer = Layer(n_in = n_emb, n_out = n_emb, activation = tanh)
        layers.append(layer)
        slices_query = layers[-1].forward(slices_query_tmp) # bug

        layer = IterAttentionLayer(n_in = n_emb, n_out = n_emb)
        layers.append(layer)
        prev_output = layers[-1].forward(prev_output, slices_query, is_word = False, hop = args.hop_sent, aspect_num = num_aspects)
        prev_output = prev_output.reshape((prev_output.shape[0] * prev_output.shape[1], prev_output.shape[2]))
        prev_output = apply_dropout(prev_output, dropout, v2=True)

        prev_output = prev_output.reshape((num_aspects, prev_output.shape[0] / num_aspects, prev_output.shape[1]))

        softmax_inputs = []
        for i in range(num_aspects):
            softmax_inputs.append(prev_output[i])

        size = n_emb * args.hop_sent

        p_y_given_a = []
        pred_ay = []
        nll_loss_ay = []

        for i in range(num_aspects):
            layers.append(Layer(n_in = size,
                                n_out = args.score_scale,
                                activation = softmax,
                                has_bias = False,))

            p_y_given_a.append(layers[-1].forward(softmax_inputs[i]))
            nll_loss_ay.append(T.mean(T.sum(-T.log(p_y_given_a[-1]) * aay[:, i, :] * ay_mask[:, i].dimshuffle(0, 'x'))))
            pred_ay.append(T.argmax(p_y_given_a[-1], axis = 1))

        self.p_y_given_a = p_y_given_a
        self.nll_loss_ay = T.sum(nll_loss_ay)
        self.pred_ay = T.stack(pred_ay).dimshuffle(1, 0)

        for l, i in zip(layers[4:], range(len(layers[3:]))):
            say("layer {}: n_in={}\tn_out={}\n".format(
                i, l.n_in, l.n_out
            ))

        self.l2_sqr = None
        self.params = []
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))
Ejemplo n.º 27
0
    def ready(self):
        global total_encode_time
        #say("in encoder ready: \n")
        #start_encode_time = time.time()
        generator = self.generator
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = generator.dropout

        # len*batch
        x = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0, 1, "x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation,
                            order=args.order)
            elif layer_type == "lstm":
                l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation)
            layers.append(l)

        # len * batch * 1
        masks = T.cast(
            T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
            theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # len*batch*n_e
        embs = generator.word_embs

        pooling = args.pooling
        lst_states = []
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
            else:
                lst_states.append(h_next[-1])  # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=self.nclasses,
                                                 activation=sigmoid)

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch * nclasses
        loss_mat = self.loss_mat = (preds - y)**2

        pred_diff = self.pred_diff = T.mean(
            T.max(preds, axis=1) - T.min(preds, axis=1))

        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:, args.aspect]
        self.loss_vec = loss_vec

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_g = cost_logpz * 10 + generator.l2_cost
        self.cost_e = loss * 10 + l2_cost
Ejemplo n.º 28
0
    def ready(self, args, train):
        # len * batch
        self.idxs = T.imatrix()
        self.idys = T.imatrix()
        self.init_state = T.matrix(dtype=theano.config.floatX)

        dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
        self.dropout = theano.shared(dropout_prob)

        self.n_d = args["hidden_dim"]

        embedding_layer = EmbeddingLayer(
                n_d = self.n_d,
                vocab = set(w for w in train)
            )
        self.n_V = embedding_layer.n_V

        say("Vocab size: {}\tHidden dim: {}\n".format(
                self.n_V, self.n_d
            ))

        activation = get_activation_by_name(args["activation"])

        rnn_layer = LSTM(
                 n_in = self.n_d,
                 n_out = self.n_d,
                 activation = activation
            )

        output_layer = Layer(
                n_in = self.n_d,
                n_out = self.n_V,
                activation = T.nnet.softmax,
            )

        # (len*batch) * n_d
        x_flat = embedding_layer.forward(self.idxs.ravel())

        # len * batch * n_d
        x = apply_dropout(x_flat, self.dropout)
        x = x.reshape( (self.idxs.shape[0], self.idxs.shape[1], self.n_d) )

        # len * batch * (n_d+n_d)
        h = rnn_layer.forward_all(x, self.init_state, return_c=True)

        self.last_state = h[-1]
        h = h[:,:,self.n_d:]
        h = apply_dropout(h, self.dropout)

        self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))

        idys = self.idys.ravel()
        self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
        #self.nll = T.nnet.categorical_crossentropy(
        #                self.p_y_given_x,
        #                idys
        #            )

        self.layers = [ embedding_layer, rnn_layer, output_layer ]
        #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
        self.params = embedding_layer.params + rnn_layer.params + output_layer.params
        self.num_params = sum(len(x.get_value(borrow=True).ravel())
                                for l in self.layers for x in l.params)
        say("# of params in total: {}\n".format(self.num_params))
Ejemplo n.º 29
0
    def ready(self):
        args = self.args
        weights = self.weights

        # len(title) * batch
        idts = self.idts = T.imatrix()

        # len(body) * batch
        idbs = self.idbs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        idps = self.idps = T.imatrix()

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))
        dropout_op = self.dropout_op = Dropout(self.dropout)

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
        elif args.layer.lower() == "gru":
            LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = []
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation)
            else:
                feature_layer = LayerType(n_in=n_e if i == 0 else n_d,
                                          n_out=n_d,
                                          activation=activation,
                                          order=args.order,
                                          mode=args.mode,
                                          has_outgate=args.outgate)
            layers.append(feature_layer)

        # feature computation starts here

        # (len*batch)*n_e
        xt = embedding_layer.forward(idts.ravel())
        if weights is not None:
            xt_w = weights[idts.ravel()].dimshuffle((0, 'x'))
            xt = xt * xt_w

        # len*batch*n_e
        xt = xt.reshape((idts.shape[0], idts.shape[1], n_e))
        xt = apply_dropout(xt, dropout)

        # (len*batch)*n_e
        xb = embedding_layer.forward(idbs.ravel())
        if weights is not None:
            xb_w = weights[idbs.ravel()].dimshuffle((0, 'x'))
            xb = xb * xb_w

        # len*batch*n_e
        xb = xb.reshape((idbs.shape[0], idbs.shape[1], n_e))
        xb = apply_dropout(xb, dropout)

        prev_ht = self.xt = xt
        prev_hb = self.xb = xb
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            hb = layers[i].forward_all(prev_hb)
            prev_ht = ht
            prev_hb = hb

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            hb = self.normalize_3d(hb)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.hb = hb

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            hb = self.average_without_padding(hb, idbs)
        else:
            ht = ht[-1]
            hb = hb[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
        h_final = (ht + hb) * 0.5
        h_final = apply_dropout(h_final, dropout)
        h_final = self.normalize_2d(h_final)
        self.h_final = h_final
        say("h_final dtype: {}\n".format(ht.dtype))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])

        # For training:
        xp = h_final[idps.ravel()]
        xp = xp.reshape((idps.shape[0], idps.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:, 0, :]
        # num query
        pos_scores = T.sum(query_vecs * xp[:, 1, :], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0, 'x', 1)) * xp[:, 2:, :],
                           axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        loss = T.mean((diff > 0) * diff)
        self.loss = loss

        params = []
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = p.norm(2)
            else:
                l2_reg = l2_reg + p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.cost = self.loss + l2_reg
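
The training loss above is a max-margin ranking loss over (query, positive, negatives) rows of idps: score the positive and the hardest negative against the query, then hinge with margin 1.0. A small NumPy sketch with toy vectors:

import numpy as np

n_query, n_cand, n_d = 3, 6, 8                 # col 0: query, col 1: positive, cols 2+: negatives
xp = np.random.randn(n_query, n_cand, n_d)

query_vecs = xp[:, 0, :]
pos_scores = (query_vecs * xp[:, 1, :]).sum(axis=1)
neg_scores = (query_vecs[:, None, :] * xp[:, 2:, :]).sum(axis=2).max(axis=1)

diff = neg_scores - pos_scores + 1.0           # margin of 1.0
loss = ((diff > 0) * diff).mean()              # hinge on the hardest negative only
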
Ejemplo n.º 30
0
    def ready(self):
        generator = self.generator
        args = self.args
        weights = self.weights

        dropout = generator.dropout

        # len(text) * batch
        idts = generator.x
        z = generator.z_pred
        z = z.dimshuffle((0,1,"x"))

        # batch * 2
        pairs = self.pairs = T.imatrix()

        # num pairs * 3, or num queries * candidate size
        triples = self.triples = T.imatrix()

        embedding_layer = self.embedding_layer

        activation = get_activation_by_name(args.activation)
        n_d = self.n_d = args.hidden_dim
        n_e = self.n_e = embedding_layer.n_d

        if args.layer.lower() == "rcnn":
            LayerType = RCNN
            LayerType2 = ExtRCNN
        elif args.layer.lower() == "lstm":
            LayerType = LSTM
            LayerType2 = ExtLSTM
        #elif args.layer.lower() == "gru":
        #    LayerType = GRU

        depth = self.depth = args.depth
        layers = self.layers = [ ]
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            layers.append(feature_layer)

        extlayers = self.extlayers = [ ]
        for i in range(depth):
            if LayerType != RCNN:
                feature_layer = LayerType2(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation
                    )
            else:
                feature_layer = LayerType2(
                        n_in = n_e if i == 0 else n_d,
                        n_out = n_d,
                        activation = activation,
                        order = args.order,
                        mode = args.mode,
                        has_outgate = args.outgate
                    )
            feature_layer.copy_params(layers[i])
            extlayers.append(feature_layer)


        # feature computation starts here

        xt = generator.word_embs

        # encode full text into representation
        prev_ht = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            ht = layers[i].forward_all(prev_ht)
            prev_ht = ht

        # encode selected text into representation
        prev_htz = self.xt = xt
        for i in range(depth):
            # len*batch*n_d
            htz = extlayers[i].forward_all(prev_htz, z)
            prev_htz = htz

        # normalize vectors
        if args.normalize:
            ht = self.normalize_3d(ht)
            htz = self.normalize_3d(htz)
            say("h_title dtype: {}\n".format(ht.dtype))

        self.ht = ht
        self.htz = htz

        # average over length, ignore paddings
        # batch * d
        if args.average:
            ht = self.average_without_padding(ht, idts)
            htz = self.average_without_padding(htz, idts, z)
        else:
            ht = ht[-1]
            htz = htz[-1]
        say("h_avg_title dtype: {}\n".format(ht.dtype))

        # batch * d
        h_final = apply_dropout(ht, dropout)
        h_final = self.normalize_2d(h_final)
        hz_final = apply_dropout(htz, dropout)
        hz_final = self.normalize_2d(hz_final)
        self.h_final = h_final
        self.hz_final = hz_final

        say("h_final dtype: {}\n".format(ht.dtype))

        # For testing:
        #   first one in batch is query, the rest are candidate questions
        self.scores = T.dot(h_final[1:], h_final[0])
        self.scores_z = T.dot(hz_final[1:], hz_final[0])

        # For training encoder:
        xp = h_final[triples.ravel()]
        xp = xp.reshape((triples.shape[0], triples.shape[1], n_d))
        # num query * n_d
        query_vecs = xp[:,0,:]
        # num query
        pos_scores = T.sum(query_vecs*xp[:,1,:], axis=1)
        # num query * candidate size
        neg_scores = T.sum(query_vecs.dimshuffle((0,'x',1))*xp[:,2:,:], axis=2)
        # num query
        neg_scores = T.max(neg_scores, axis=1)
        diff = neg_scores - pos_scores + 1.0
        hinge_loss = T.mean( (diff>0)*diff )

        # For training generator

        # batch
        self_cosine_distance = 1.0 - T.sum(hz_final * h_final, axis=1)
        pair_cosine_distance = 1.0 - T.sum(hz_final * h_final[pairs[:,1]], axis=1)
        alpha = args.alpha
        loss_vec = self_cosine_distance*alpha + pair_cosine_distance*(1-alpha)
        #loss_vec = self_cosine_distance*0.2 + pair_cosine_distance*0.8

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        sfactor = args.sparsity
        cfactor = args.sparsity * args.coherent
        scost_vec = zsum*sfactor + zdiff*cfactor

        # batch
        cost_vec = loss_vec + scost_vec
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(scost_vec)
        self.obj =  loss + sparsity_cost

        params = [ ]
        for l in self.layers:
            params += l.params
        self.params = params
        say("num of parameters: {}\n".format(
            sum(len(x.get_value(borrow=True).ravel()) for x in params)
        ))

        l2_reg = None
        for p in params:
            if l2_reg is None:
                l2_reg = T.sum(p**2) #p.norm(2)
            else:
                l2_reg = l2_reg + T.sum(p**2) #p.norm(2)
        l2_reg = l2_reg * args.l2_reg
        self.l2_cost = l2_reg

        beta = args.beta
        self.cost_g = cost_logpz + generator.l2_cost
        self.cost_e = hinge_loss + loss*beta + l2_reg
        print "cost dtype", self.cost_g.dtype, self.cost_e.dtype