Example #1
    def __init__(self, n_in, n_out, fc_in, fc_out, sample=False):
        super(Sampler, self).__init__(n_in=n_in, n_out=n_out)
        if sample:
            self.MRG_rng = MRG_RandomStreams()

        self.fc_layer = Layer(n_in=fc_in,
                              n_out=fc_out,
                              activation=get_activation_by_name('relu'),
                              has_bias=True)

        self.fc_layer_final = Layer(
            n_in=fc_out,
            n_out=1,
            activation=get_activation_by_name('sigmoid'),
            has_bias=True,
            clip_inp=True)
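
A note on the `sample` flag: the `MRG_rng` created here is Theano's MRG31k3p random stream, typically used to draw binary samples from predicted probabilities. A minimal sketch of that kind of sampling, assuming only standard Theano (the `probs` matrix and seed are illustrative, not from the repository):

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

# hypothetical per-entry probabilities -> one Bernoulli draw per entry
rng = MRG_RandomStreams(seed=1234)
probs = T.matrix('probs')
samples = rng.binomial(size=probs.shape, p=probs,
                       dtype=theano.config.floatX)
sample_fn = theano.function([probs], samples)

print(sample_fn(np.full((2, 3), 0.5, dtype=theano.config.floatX)))
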
Example #2
    def ready(self):

        embedding_layer = self.embedding_layer

        args = self.args
        self.padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # inp_len x batch
        x = self.x = T.imatrix('x')
        fw_mask = self.fw_mask = T.imatrix('fw')
        chunk_sizes = self.chunk_sizes = T.imatrix('sizes')
        self.bm = T.imatrix('bm')
        self.posit_x = T.imatrix('pos')

        rv_mask = T.concatenate([T.ones((1, fw_mask.shape[1])), fw_mask[:-1]],
                                axis=0)
        self.z_totals = T.sum(T.neq(self.x, self.padding_id),
                              axis=0,
                              dtype=theano.config.floatX)
        self.layers = []
        self.params = []

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        self.pad_mask = T.cast(T.neq(x, self.padding_id), 'int32')
        self.chunk_mask = T.cast(T.neq(chunk_sizes, 0), 'int32')

        embs = embedding_layer.forward(x.ravel())

        self.word_embs = embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        self.embs = apply_dropout(embs, dropout)

        if args.generator_encoding == 'cnn':
            h_final, size = self.cnn_encoding(chunk_sizes, rv_mask, n_e,
                                              n_d / 2)
        else:
            h_final, size = self.lstm_encoding(fw_mask, rv_mask, n_e, n_d,
                                               activation)

        self.size = size
        self.h_final = apply_dropout(h_final, dropout)
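
The padding handling above follows a simple idiom: `T.neq(x, self.padding_id)` marks real tokens with 1 and padding with 0, and summing that mask over the time axis gives the true length of each batch entry (`z_totals`). A self-contained sketch, assuming `padding_id = 0` and a toy input (both illustrative):

import numpy as np
import theano
import theano.tensor as T

padding_id = 0                      # assumed id of "<padding>"
x = T.imatrix('x')                  # inp_len x batch
pad_mask = T.cast(T.neq(x, padding_id), 'int32')
z_totals = T.sum(T.neq(x, padding_id), axis=0, dtype=theano.config.floatX)

f = theano.function([x], [pad_mask, z_totals])
mask, totals = f(np.array([[3, 5], [7, 0], [0, 0]], dtype='int32'))
print(mask)     # [[1 1] [1 0] [0 0]]
print(totals)   # [ 2.  1.]  -- non-padding tokens per batch column
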
Example #3
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        self.padding_id = padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # inp_len x batch
        x = self.x = T.imatrix('x')

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []

        for i in xrange(2):
            if args.layer == 'lstm':
                l = LSTM(
                    n_in=n_e,
                    n_out=n_d,
                    activation=activation,
                )
            else:
                l = RCNN(n_in=n_e,
                         n_out=n_d,
                         activation=activation,
                         order=args.order)
            layers.append(l)

        self.masks = masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

        embs = embedding_layer.forward(x.ravel())

        self.word_embs = embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        flipped_embs = embs[::-1]

        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        self.h_final = h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = ZLayer(
            n_in=size,
            n_hidden=args.hidden_dimension2,
            activation=activation,
            layer='rcnn',
        )

        z_pred, sample_updates = output_layer.sample_all(h_final)
        self.non_sampled_zpred, _ = output_layer.sample_all_pretrain(h_final)

        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        self.sample_updates = sample_updates

        probs = output_layer.forward_all(h_final, z_pred)

        logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape) * masks

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer] + [embedding_layer]:
            for p in l.params:
                params.append(p)

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
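
The running accumulation that builds `l2_cost` at the end of this example (and is repeated in the later ones) can be collapsed into a single expression. A minimal sketch with toy shared variables standing in for the collected layer parameters and `l2_reg` standing in for `args.l2_reg`:

import numpy as np
import theano
import theano.tensor as T

# toy parameters standing in for the collected layer weights
params = [theano.shared(np.ones((2, 2), dtype=theano.config.floatX)),
          theano.shared(np.ones((3,), dtype=theano.config.floatX))]
l2_reg = 1e-5

# equivalent to the if/else accumulation loop above
l2_cost = l2_reg * sum(T.sum(p ** 2) for p in params)
print(l2_cost.eval())   # 1e-5 * (4 + 3) = 7e-05
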
Example #4
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        layers = []
        params = self.params = []

        # hl_inp_len x (batch * n)
        y = self.y = T.imatrix('y')
        # (batch * n) x n_classes
        gold_standard_entities = self.gold_standard_entities = T.ivector('gs')
        loss_mask = self.loss_mask = T.ivector('loss_mask')

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        mask_y = T.cast(T.neq(y, padding_id), theano.config.floatX).dimshuffle(
            (0, 1, 'x'))

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d

        embs_y = embedding_layer.forward(y.ravel())
        embs_y = embs_y.reshape((y.shape[0], y.shape[1], n_e))

        flipped_embs_y = embs_y[::-1]
        flipped_mask_y = mask_y[::-1]

        rnn_fw = MaskedLSTM(n_in=n_e, n_out=n_d)

        rnn_rv = MaskedLSTM(n_in=n_e, n_out=n_d)

        h_f_y = rnn_fw.forward_all_hl(embs_y, mask_y)
        h_r_y = rnn_rv.forward_all_hl(flipped_embs_y, flipped_mask_y)

        # 1 x (batch * n) x n_d -> (batch * n) x (2 * n_d) x 1
        h_concat_y = T.concatenate([h_f_y, h_r_y], axis=2).dimshuffle(
            (1, 2, 0))

        layers.append(rnn_fw)
        layers.append(rnn_rv)

        if not args.qa_hl_only:
            self.x = x = T.imatrix('x')
            mask_x = T.cast(T.neq(x, padding_id),
                            theano.config.floatX).dimshuffle((0, 1, 'x'))
            tiled_x_mask = T.tile(mask_x, (args.n, 1)).dimshuffle((1, 0, 2))

            embs = embedding_layer.forward(x.ravel())

            embs = embs.reshape((x.shape[0], x.shape[1], n_e))
            self.embs = embs = apply_dropout(embs, dropout)

            flipped_embs_x = embs[::-1]
            flipped_mask_x = mask_x[::-1]

            h_f_x = rnn_fw.forward_all_doc(embs, mask_x)
            h_r_x = rnn_rv.forward_all_doc(flipped_embs_x, flipped_mask_x)

            h_concat_x = T.concatenate([h_f_x, h_r_x[::-1]], axis=2)

            softmax_mask = T.zeros_like(tiled_x_mask) - 1e8
            self.softmax_mask = softmax_mask = softmax_mask * (tiled_x_mask -
                                                               1)

            # inp_len x batch x n_d -> inp_len x batch x (2 * n_d)
            # (batch * n) x inp_len x (2 * n_d)
            gen_h_final = T.tile(h_concat_x, (args.n, 1)).dimshuffle((1, 0, 2))

            if args.bilinear:
                bilinear_l = Bilinear(n_d, x.shape[1], args.n)
                inp_dot_hl = bilinear_l.forward(gen_h_final, h_concat_y)

                layers.append(bilinear_l)
            else:
                # (batch * n) x inp_len x 1
                inp_dot_hl = T.batched_dot(gen_h_final, h_concat_y)

            h_size = n_d * 2

            inp_dot_hl = inp_dot_hl - softmax_mask
            inp_dot_hl = inp_dot_hl.ravel()

            # (batch * n) x inp_len
            self.alpha = alpha = T.nnet.softmax(
                inp_dot_hl.reshape((args.n * x.shape[1], x.shape[0])))

            # (batch * n) x n_d * 2
            o = T.batched_dot(alpha, gen_h_final)

            output_size = h_size * 4
            h_concat_y = h_concat_y.reshape((o.shape[0], o.shape[1]))
            self.o = o = T.concatenate(
                [o, h_concat_y,
                 T.abs_(o - h_concat_y), o * h_concat_y],
                axis=1)
        else:
            h_concat_y = h_concat_y.reshape((y.shape[1], n_d * 2))
            self.o = o = h_concat_y
            output_size = n_d * 2

        fc7 = Layer(n_in=output_size,
                    n_out=512,
                    activation=get_activation_by_name('relu'),
                    has_bias=True)
        fc7_out = fc7.forward(o)

        output_layer = Layer(n_in=512,
                             n_out=self.nclasses,
                             activation=softmax,
                             has_bias=True)

        layers.append(fc7)
        layers.append(output_layer)

        preds = output_layer.forward(fc7_out)
        self.preds_clipped = preds_clipped = T.clip(preds, 1e-7, 1.0 - 1e-7)

        cross_entropy = T.nnet.categorical_crossentropy(
            preds_clipped, gold_standard_entities) * loss_mask

        loss = self.loss = T.mean(cross_entropy)

        for l in layers + [embedding_layer]:
            for p in l.params:
                params.append(p)

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)

        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_e = loss + l2_cost
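
The `softmax_mask` construction in this example is the usual masked-attention trick: padded positions get a large negative score before the softmax, so they receive (near) zero attention weight. A stand-alone sketch with a toy score matrix and mask (the names and values are illustrative, not from the repository):

import numpy as np
import theano
import theano.tensor as T

scores = T.matrix('scores')   # (batch * n) x inp_len, raw attention scores
mask = T.matrix('mask')       # 1.0 at real tokens, 0.0 at padding

# same construction as softmax_mask above: +1e8 at padded positions, 0 elsewhere
softmax_mask = (T.zeros_like(mask) - 1e8) * (mask - 1)
alpha = T.nnet.softmax(scores - softmax_mask)

f = theano.function([scores, mask], alpha)
s = np.array([[1.0, 2.0, 3.0]], dtype=theano.config.floatX)
m = np.array([[1.0, 1.0, 0.0]], dtype=theano.config.floatX)
print(f(s, m))   # the weight on the padded last position is ~0
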
Example #5
    def ready(self):
        generator = self.generator
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        layers = []
        params = self.params = []

        # hl_inp_len x (batch * n)
        y = self.y = T.imatrix('y')
        # (batch * n) x n_classes
        gold_standard_entities = self.gold_standard_entities = T.ivector('gs')
        # inp_len x batch
        bm = self.bm = generator.bm

        loss_mask = self.loss_mask = T.ivector('loss_mask')

        # inp_len x batch
        x = generator.x
        z = generator.z_pred

        mask_y = T.cast(T.neq(y, padding_id), theano.config.floatX).dimshuffle(
            (0, 1, 'x'))

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d

        embs_y = embedding_layer.forward(y.ravel())
        embs_y = embs_y.reshape((y.shape[0], y.shape[1], n_e))

        flipped_embs_y = embs_y[::-1]
        flipped_mask_y = mask_y[::-1]

        rnn_fw = MaskedLSTM(n_in=n_e, n_out=n_d)

        rnn_rv = MaskedLSTM(n_in=n_e, n_out=n_d)

        h_f_y = rnn_fw.forward_all_hl(embs_y, mask_y)
        h_r_y = rnn_rv.forward_all_hl(flipped_embs_y, flipped_mask_y)

        layers.append(rnn_fw)
        layers.append(rnn_rv)

        mask_x = T.cast(T.neq(x, padding_id) * z,
                        theano.config.floatX).dimshuffle((0, 1, 'x'))
        tiled_x_mask = T.tile(mask_x, (args.n, 1)).dimshuffle((1, 0, 2))

        if args.use_generator_h:
            h_concat_x = self.generator.word_level_h

            if args.generator_encoding == 'cnn':
                layers.extend(self.generator.layers[:4])
            else:
                layers.extend(self.generator.layers[:2])

        else:
            embs_x = generator.word_embs

            flipped_embs_x = embs_x[::-1]
            flipped_mask_x = mask_x[::-1]

            h_f_x = rnn_fw.forward_all_doc(embs_x, mask_x)
            h_r_x = rnn_rv.forward_all_doc(flipped_embs_x, flipped_mask_x)

            h_concat_x = T.concatenate([h_f_x, h_r_x[::-1]], axis=2)

        softmax_mask = T.zeros_like(tiled_x_mask) - 1e8
        self.softmax_mask = softmax_mask = softmax_mask * (tiled_x_mask - 1)

        # 1 x (batch * n) x n_d -> (batch * n) x (2 * n_d) x 1
        h_concat_y = T.concatenate([h_f_y, h_r_y], axis=2).dimshuffle(
            (1, 2, 0))

        # inp_len x batch x n_d -> inp_len x batch x (2 * n_d)
        # (batch * n) x inp_len x (2 * n_d)
        gen_h_final = T.tile(h_concat_x, (args.n, 1)).dimshuffle((1, 0, 2))

        if args.bilinear:
            bilinear_l = Bilinear(n_d, x.shape[1], args.n)
            inp_dot_hl = bilinear_l.forward(gen_h_final, h_concat_y)

            layers.append(bilinear_l)
        else:
            # (batch * n) x inp_len x 1
            inp_dot_hl = T.batched_dot(gen_h_final, h_concat_y)

        h_size = n_d * 2

        inp_dot_hl = inp_dot_hl - softmax_mask
        inp_dot_hl = inp_dot_hl.ravel()

        # (batch * n) x inp_len
        self.alpha = alpha = T.nnet.softmax(
            inp_dot_hl.reshape((args.n * x.shape[1], x.shape[0])))

        # (batch * n) x n_d * 2
        o = T.batched_dot(alpha, gen_h_final)

        output_size = h_size * 4
        h_concat_y = h_concat_y.reshape((o.shape[0], o.shape[1]))
        self.o = o = T.concatenate(
            [o, h_concat_y,
             T.abs_(o - h_concat_y), o * h_concat_y], axis=1)

        fc7 = Layer(n_in=output_size,
                    n_out=512,
                    activation=get_activation_by_name('relu'),
                    has_bias=True)
        fc7_out = fc7.forward(o)

        output_layer = Layer(n_in=512,
                             n_out=self.nclasses,
                             activation=softmax,
                             has_bias=True)

        layers.append(fc7)
        layers.append(output_layer)

        preds = output_layer.forward(fc7_out)
        self.preds_clipped = preds_clipped = T.clip(preds, 1e-7, 1.0 - 1e-7)

        cross_entropy = T.nnet.categorical_crossentropy(
            preds_clipped, gold_standard_entities) * loss_mask
        loss_mat = cross_entropy.reshape((x.shape[1], args.n))

        word_ol = z * bm

        total_z_word_overlap_per_sample = T.sum(word_ol, axis=0)
        total_overlap_per_sample = T.sum(bm, axis=0) + args.bigram_smoothing

        self.word_overlap_loss = word_overlap_loss = total_z_word_overlap_per_sample / total_overlap_per_sample

        self.loss_vec = loss_vec = T.mean(loss_mat, axis=1)

        logpz = generator.logpz

        loss = self.loss = T.mean(cross_entropy)

        z_totals = T.sum(T.neq(x, padding_id),
                         axis=0,
                         dtype=theano.config.floatX)
        self.zsum = zsum = T.abs_(generator.zsum / z_totals - args.z_perc)
        self.zdiff = zdiff = generator.zdiff / z_totals

        self.cost_vec = cost_vec = loss_vec + args.coeff_adequacy * (
            1 - word_overlap_loss) + args.coeff_z * (2 * zsum + zdiff)

        self.logpz = logpz = T.sum(logpz, axis=0)
        self.cost_logpz = cost_logpz = T.mean(cost_vec * logpz)
        self.obj = T.mean(cost_vec)

        for l in layers + [embedding_layer]:
            for p in l.params:
                params.append(p)

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)

        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_g = cost_logpz * args.coeff_cost_scale + generator.l2_cost
        self.cost_e = loss + l2_cost
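
The regularizers folded into `cost_vec` normalize the generator's selection statistics by document length: `zsum` measures how far the selected fraction deviates from the target rate `args.z_perc`, and `zdiff` counts 0/1 transitions, which favours contiguous selections. A toy sketch of those two quantities, with `z_perc` and the inputs chosen purely for illustration:

import numpy as np
import theano
import theano.tensor as T

z = T.matrix('z')              # inp_len x batch, binary selections from the generator
lengths = T.vector('lengths')  # non-padding tokens per batch entry (z_totals above)
z_perc = 0.5                   # stand-in for args.z_perc

zsum = T.abs_(T.sum(z, axis=0) / lengths - z_perc)        # deviation from target rate
zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0) / lengths   # normalized transitions

f = theano.function([z, lengths], [zsum, zdiff])
toy_z = np.array([[1], [1], [0], [0]], dtype=theano.config.floatX)
print(f(toy_z, np.array([4.], dtype=theano.config.floatX)))  # [0.], [0.25]
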
Example #6
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(n_in=n_e,
                         n_out=n_d,
                         activation=activation,
                         order=args.order)
            elif layer_type == "lstm":
                l = LSTM(n_in=n_e, n_out=n_d, activation=activation)
            layers.append(l)

        # len * batch
        self.masks = T.cast(T.neq(x, padding_id), theano.config.floatX)

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)

        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        h1_sent = h1[args.sentence_length - 1::args.sentence_length]
        h2_sent = h2[args.sentence_length - 1::args.sentence_length]
        # h_final_sent = T.concatenate([h1_sent, h2_sent[::-1]], axis=2)
        # h_final_sent = apply_dropout(h_final_sent, dropout)

        output_layer = self.output_layer = ZLayer(
            n_in=size, n_hidden=args.hidden_dimension2, activation=activation)

        # sample z given text (i.e. x)
        z_pred, sample_updates = output_layer.sample_all(h_final)

        # we compute an approximate gradient by sampling z, so the sampled z
        # must be marked as not being part of the gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        self.sample_updates = sample_updates
        print "z_pred", z_pred.ndim

        probs_word = output_layer.forward_all(h_final, z_pred)

        # SENTENCE LEVEL

        # output_layer_sent = self.output_layer_sent = ZLayer(
        #     n_in=size,
        #     n_hidden=args.hidden_dimension2,
        #     activation=activation
        # )
        #
        # z_pred_sent, sample_updates_sent = output_layer_sent.sample_all(h_final_sent)
        #
        # z_pred_sent = self.z_pred_sent = theano.gradient.disconnected_grad(z_pred_sent)
        # self.sample_updates_sent = sample_updates_sent
        #
        # probs_sent = output_layer_sent.forward_all(h_final_sent, z_pred_sent)
        #
        # z_pred_sent = T.repeat(z_pred_sent, args.sentence_length, axis=0)
        self.z_pred_combined = z_pred

        # probs_sent = T.repeat(probs_sent, args.sentence_length, axis=0)
        probs = probs_word

        logpz = -T.nnet.binary_crossentropy(probs,
                                            self.z_pred_combined) * self.masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = self.z_pred_combined
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                      for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost
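
As the comment in this example notes, the gradient is approximated by sampling z, so the sampled values must be treated as constants rather than differentiated through; that is exactly what `theano.gradient.disconnected_grad` does. A minimal sketch of its effect on a toy expression (illustrative only):

import numpy as np
import theano
import theano.tensor as T

x = T.scalar('x')
const_x = theano.gradient.disconnected_grad(x)  # treated as a constant by T.grad

y = x * const_x
g = T.grad(y, x)          # d/dx (x * const) = const, not 2 * x
f = theano.function([x], g)
print(f(np.asarray(3.0, dtype=theano.config.floatX)))   # prints 3.0, not 6.0
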
Example #7
    def ready(self):
        generator = self.generator
        embedding_layer = self.embedding_layer
        embedding_layer_y = self.embedding_layer_y

        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = generator.dropout

        # len*batch
        y = self.y = T.imatrix()
        y_mask = T.cast(T.neq(y, padding_id), theano.config.floatX)

        bv = self.bv = T.imatrix()

        z = generator.z_pred_combined
        z = z.dimshuffle((0, 1, "x"))
        y_mask = y_mask.dimshuffle((0, 1, "x"))

        # batch*nclasses
        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        # (len*batch)*n_e
        embs = generator.word_embs
        # (gs_len*batch)*n_e
        embs_y = embedding_layer_y.forward(y.ravel())
        embs_y = embs_y.reshape((y.shape[0], y.shape[1], n_e))

        l = ExtRCNN(n_in=n_e,
                    n_out=n_d,
                    activation=activation,
                    order=args.order)

        h_prev = embs
        h_prev_y = embs_y
        # len*batch*n_d
        h_next_y = l.forward_all_2(h_prev_y, y_mask)
        h_next_y = theano.gradient.disconnected_grad(h_next_y)

        h_next = l.forward_all(h_prev, z)

        h_next = h_next[::args.sentence_length]
        h_final_y = h_next_y[::args.sentence_length_hl]

        h_final = apply_dropout(h_next, dropout)

        h_final_y = h_final_y.dimshuffle(1, 0, 2)  # 15 x 4 x 200
        h_final = h_final.dimshuffle(1, 0, 2)  # 15 x 10 x 200

        h_final_y_r = (h_final_y**2).sum(2, keepdims=True)  # 15 x 4 x 1
        h_final_r = (h_final**2).sum(2, keepdims=True).dimshuffle(
            0, 2, 1)  # 15 x 1 x 10

        batched_dot = T.batched_dot(h_final_y,
                                    h_final.dimshuffle(0, 2, 1))  # 15 x 4 x 10

        squared_euclidean_distances = h_final_y_r + h_final_r - 2 * batched_dot  # (15 x 4 x 1 + 15 x 1 x 10) +  (15 x 4 x 10)
        similarity = T.sqrt(squared_euclidean_distances).dimshuffle(
            1, 0, 2)  # 4 x 15 x 10

        loss_mat = self.loss_mat = T.min(similarity, axis=2,
                                         keepdims=True)  # 4 x 15 x 1

        self.loss_vec = loss_vec = T.mean(loss_mat, axis=0)

        zsum = generator.zsum
        zdiff = generator.zdiff
        logpz = generator.logpz

        padded = T.shape_padaxis(T.zeros_like(bv[0]), axis=1).dimshuffle(
            (1, 0))
        component_2 = T.concatenate([bv[1:], padded], axis=0)

        # component_2 = T.stack([shifted_bv, bv], axis=2)
        self.bigram_overlap = component_2 * bv

        intersection = T.sum(self.bigram_overlap)
        jac = (intersection +
               args.jaccard_smoothing) / (T.sum(bv) + args.jaccard_smoothing)
        jac = 1 - jac

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                             T.mean(zdiff) * coherent_factor

        samp = zsum * args.sparsity + zdiff * coherent_factor
        cost_vec = samp + loss_vec + jac
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))

        self.obj = T.mean(cost_vec) + jac
        self.encoder_params = l.params

        params = self.params = []

        for p in l.params:
            params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                      for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        self.cost_g = cost_logpz * 10 + generator.l2_cost
        self.cost_e = loss * 10 + l2_cost
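
The `squared_euclidean_distances` computation in this last example uses the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b to get every highlight-to-document distance with a single `batched_dot`. A small sketch checking that identity on toy tensors (the shapes are illustrative):

import numpy as np
import theano
import theano.tensor as T

A = T.tensor3('A')   # batch x 4 x d   (plays the role of h_final_y)
B = T.tensor3('B')   # batch x 10 x d  (plays the role of h_final)

A_sq = (A ** 2).sum(2, keepdims=True)                      # batch x 4 x 1
B_sq = (B ** 2).sum(2, keepdims=True).dimshuffle(0, 2, 1)  # batch x 1 x 10
cross = T.batched_dot(A, B.dimshuffle(0, 2, 1))            # batch x 4 x 10

sq_dists = A_sq + B_sq - 2 * cross   # ||a - b||^2 for every pair, via broadcasting

f = theano.function([A, B], sq_dists)
a = np.random.randn(1, 4, 5).astype(theano.config.floatX)
b = np.random.randn(1, 10, 5).astype(theano.config.floatX)
ref = ((a[:, :, None, :] - b[:, None, :, :]) ** 2).sum(-1)
print(np.allclose(f(a, b), ref, atol=1e-4))                # True
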