Example #1
0
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        label_smooth_eps, ):
    """Assemble the encoder/decoder graph and its weighted training loss.

    The hyper-parameters are forwarded unchanged to ``wrap_encoder`` and
    ``wrap_decoder``. When ``label_smooth_eps`` is truthy the labels are
    one-hot encoded (depth ``trg_vocab_size``), smoothed with that epsilon
    and the cross entropy is computed against soft labels; otherwise the
    labels are used as they come from ``make_all_inputs``.

    Returns:
        tuple: ``(sum_cost, avg_cost, predict, token_num)`` where
        ``sum_cost`` is the sum of ``cost * weights``, ``token_num`` is the
        sum of the weights and ``avg_cost`` is ``sum_cost / token_num``.
    """
    # Hyper-parameters passed identically to the encoder and the decoder.
    stack_hparams = (max_length, n_layer, n_head, d_key, d_value, d_model,
                     d_inner_hid, dropout_rate)

    encoder_feed = make_all_inputs(encoder_data_input_fields +
                                   encoder_util_input_fields)
    enc_output = wrap_encoder(
        *((src_vocab_size, ) + stack_hparams + (encoder_feed, )))

    # The last decoder data field is not part of the training inputs.
    decoder_feed = make_all_inputs(decoder_data_input_fields[:-1] +
                                   decoder_util_input_fields)
    predict = wrap_decoder(
        *((trg_vocab_size, ) + stack_hparams + (decoder_feed, enc_output)))

    # Padding positions must not contribute to the total loss; the weights
    # are used to cancel them when the loss is accumulated.
    label, weights = make_all_inputs(label_data_input_fields)
    use_soft_label = bool(label_smooth_eps)
    if use_soft_label:
        one_hot_label = layers.one_hot(input=label, depth=trg_vocab_size)
        label = layers.label_smooth(
            label=one_hot_label, epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict, label=label, soft_label=use_soft_label)
    sum_cost = layers.reduce_sum(cost * weights)
    token_num = layers.reduce_sum(weights)
    return sum_cost, sum_cost / token_num, predict, token_num
Example #2
0
    def forward(self, enc_inputs, dec_inputs, label, weights):
        """
        Run the encoder and decoder, then compute the weighted loss.

        :param enc_inputs: inputs handed to ``self._wrap_encoder_layer``
        :param dec_inputs: inputs handed to ``self._wrap_decoder_layer``
            together with the encoder output
        :param label: target labels; one-hot encoded and smoothed when
            ``self._label_smooth_eps`` is truthy, used as-is otherwise
        :param weights: per-position multipliers applied to the cost; their
            sum is used as the token count
        :return: ``(sum_cost, avg_cost, predict, token_num)``
        """
        enc_output = self._wrap_encoder_layer(enc_inputs)
        predict = self._wrap_decoder_layer(dec_inputs, enc_output)

        use_soft_label = bool(self._label_smooth_eps)
        if use_soft_label:
            label_out = layers.label_smooth(
                label=layers.one_hot(input=label,
                                     depth=self._trg_vocab_size),
                epsilon=self._label_smooth_eps)
        else:
            # Without smoothing the hard labels go straight into the loss.
            # (Leaving ``label_out`` unbound here raised UnboundLocalError.)
            label_out = label

        cost = layers.softmax_with_cross_entropy(
            logits=predict,
            label=label_out,
            soft_label=use_soft_label)
        weighted_cost = cost * weights
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(weights)
        # The token count is a normaliser only; keep it out of back-prop.
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num
        return sum_cost, avg_cost, predict, token_num
Example #3
0
    def forward(self, inputs, is_infer=False):
        """
        Run model main forward.

        In training mode (``is_infer`` false) the recognition network
        produces logits over the latent types; their softmax is returned as
        ``outputs["post_probs"]`` and a Gumbel-softmax sample of them selects
        the latent embedding. In inference mode the latent is selected by a
        one-hot of ``inputs["latent_id"]`` and empty per-layer key/value
        caches are stored on ``self.generation_caches``.

        Returns a dict with ``"enc_out"`` and, in training mode only,
        ``"post_probs"`` and ``"checkpoints"``.
        """
        outputs = {}

        def _empty_cache(hidden_width):
            # Buffer with a zero-sized second dimension, batched like the
            # token ids.
            return layers.fill_constant_batch_size_like(
                input=inputs["token_ids"],
                shape=[-1, 0, hidden_width],
                dtype=self.dtype,
                value=0)

        if is_infer:
            self.generation_caches = [{
                "k": _empty_cache(self.d_key * self.n_head),
                "v": _empty_cache(self.d_value * self.n_head),
            } for _ in range(self.n_layer)]
        else:
            self.generation_caches = None

        latent_table = layers.create_parameter(
            shape=[self.emb_size, self.latent_type_size],
            dtype=self.dtype,
            attr=fluid.ParamAttr(name=self.latent_emb_name,
                                 initializer=self.param_initializer))

        if not is_infer:
            logits, recognition_checkpoints = self._recognition_network(
                token_ids=inputs["token_ids"],
                type_ids=inputs["type_ids"],
                pos_ids=inputs["pos_ids"],
                role_ids=inputs.get("role_ids", None),
                recognition_mask=inputs["recognition_mask"],
            )
            outputs["post_probs"] = layers.softmax(logits)
            latent_weights = self._gumbel_softmax(logits)
            outputs["checkpoints"] = recognition_checkpoints
        else:
            latent_weights = layers.one_hot(inputs["latent_id"],
                                            self.latent_type_size)

        # The table is stored as [emb_size, latent_type_size], hence the
        # transpose when mixing it with the latent weights.
        latent_emb = layers.matmul(x=latent_weights,
                                   y=latent_table,
                                   transpose_y=True)
        enc_out, generation_checkpoints = self._generation_network(
            token_ids=inputs["token_ids"],
            type_ids=inputs["type_ids"],
            pos_ids=inputs["pos_ids"],
            role_ids=inputs.get("role_ids", None),
            generation_mask=inputs["generation_mask"],
            aux_emb=layers.unsqueeze(latent_emb, axes=[1]),
            gather_idx=inputs.get("parent_idx", None),
        )
        outputs["enc_out"] = enc_out

        if not is_infer:
            # Appends in place to the recognition checkpoint list.
            outputs["checkpoints"].extend(generation_checkpoints)
        return outputs
Example #4
0
def std_gen_interpolate(batch_size=8, seed=None, out_path='data/out',
                        levels=None, interpolate_mode=0):
    """Render an interpolation between random generator inputs.

    ``batch_size`` random inputs are drawn and, for every consecutive pair
    (the last one wraps around to the first), 40 intermediate frames are
    generated. Frames are written as PNGs to a scratch directory, combined
    into ``out_path + '.gif'`` and encoded by ``ffmpeg`` into
    ``out_path + '.mp4'``; the scratch directory is removed afterwards.

    Args:
        batch_size: number of random key points to interpolate between.
        seed: if not None, ``rds.rng`` is replaced by a ``RandomState``
            seeded with it.
        out_path: output path prefix (without extension).
        levels: ``';'``-separated names of the inputs to interpolate
            (``y`` or ``z<digit>...``). Inputs not listed stay fixed at the
            first sample. Defaults to all of them.
        interpolate_mode: ``1`` applies the smoothstep curve
            ``a**2 * (3 - 2*a)`` to the blend factor; any other value keeps
            it linear.
    """
    # Function-scope imports keep this block self-contained.
    import shlex
    import shutil

    all_levels = (
        "y;z0;z11;z12;z21;z22;z31;z32;z41;z42;z51;z52;z61;z62").split(';')
    levels = all_levels if levels is None else levels.split(';')

    img_save_dir = os.path.join('/tmp', out_path+'.dir')
    # Start from an empty scratch directory (was: rm -rf / mkdir -p).
    shutil.rmtree(img_save_dir, ignore_errors=True)
    os.makedirs(img_save_dir, exist_ok=True)

    with dg.no_grad():
        model_cache.train_mode = False
        model_cache.initialized = False
        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random
        G = model_cache.G
        x_np = rds.rng.randn(batch_size,140).astype('float32')
        y_np = rds.rng.randint(0,1000,size=[batch_size]).astype('int64')
        x = dg.to_variable(x_np)
        y_cls = dg.to_variable(y_np)
        y_hot = layers.one_hot(layers.unsqueeze(y_cls,[1]), depth=1000)
        y_embed = G.embed_y(y_hot)
        # Append the first sample so the last key point blends back to it.
        x = layers.concat([x, x[:1]], 0)
        y_embed = layers.concat([y_embed, y_embed[:1]], 0)

        # Explicit dictionaries replace the former writes through locals(),
        # which are not supported and break on Python 3.13+ (PEP 667).
        key_points = {}  # level name -> tensor holding every key point
        current = {}     # level name -> tensor fed to G for this frame
        for level in all_levels:
            if len(level) == 1:
                key_points[level] = y_embed
                current[level] = y_embed[:1]
            else:
                # The digit after 'z' selects a 20-wide slice of the noise.
                idx = int(level[1])*20
                key_points[level] = x[:,idx:idx+20]
                current[level] = x[:1,idx:idx+20]

        imgs = []
        for i in range(batch_size):
            for j in range(40):
                alpha = j / 40
                if interpolate_mode == 1:
                    alpha = alpha**2 * (3 - 2 * alpha)
                for level in levels:
                    points = key_points[level]
                    current[level] = ((1 - alpha) * points[i:i+1]
                                      + alpha * points[i+1:i+2])
                inputs = [current[level] for level in all_levels[1:]]
                img_pd = G(inputs, current[all_levels[0]], True)
                img = np.uint8(img_pd.numpy().clip(0,1)*255)[0].transpose([1,2,0])
                imgs.append(Image.fromarray(img))
                stdout.write(f'{i*40+j+1}/{40*batch_size}\r')
                stdout.flush()
        print('')
        for i, img in enumerate(imgs):
            img.save(os.path.join(img_save_dir, str(i).zfill(5)+'.png'))
        imgs[0].save(out_path+'.gif', save_all=True, append_images=imgs[1:], duration=40, loop=0)
        out_path = out_path + '.mp4'
        # Quote the paths so spaces or shell metacharacters cannot break
        # (or hijack) the ffmpeg command line.
        frame_pattern = shlex.quote(f'{img_save_dir}/%05d.png')
        os.system(f'ffmpeg -r 40 -i {frame_pattern} -hide_banner -loglevel warning -nostats -c:v libx264 -crf 23 -y {shlex.quote(out_path)}')
        shutil.rmtree(img_save_dir, ignore_errors=True)
    def _collect_metrics(self, inputs, outputs):
        """ Calculate loss function by using inputs and outputs.

        Returns a dict with ``"nll"``/``"token_nll"``, optionally
        ``"bow"``/``"token_bow"`` and ``"dis"``, plus the combined
        ``"loss"`` and the target token count ``"token_num"``.
        """
        metrics = {}
        use_smoothing = self.label_smooth > 0

        # Per-sequence mask sum minus one, summed over the batch; used only
        # as a normaliser, so it is excluded from back-propagation.
        tgt_len = layers.reduce_sum(
            layers.reduce_sum(inputs["tgt_mask"], dim=1) - 1)
        tgt_len.stop_gradient = True

        def _summarize(token_loss):
            # Returns (batch mean of per-sequence sums, total / tgt_len).
            seq_loss = layers.reduce_sum(token_loss, dim=1)
            per_token = layers.reduce_sum(seq_loss) / tgt_len
            return layers.reduce_mean(seq_loss), per_token

        # Targets are the token ids with the first position dropped.
        label = inputs["tgt_token"][:, 1:]
        if use_smoothing:
            one_hot_label = layers.one_hot(label, self.num_token_embeddings)
            smooth_label = layers.label_smooth(one_hot_label,
                                               epsilon=self.label_smooth,
                                               dtype=self._dtype)
            token_loss = layers.cross_entropy(outputs["dec_pred"],
                                              smooth_label,
                                              soft_label=True,
                                              ignore_index=self.padding_idx)
        else:
            token_loss = layers.cross_entropy(outputs["dec_probs"],
                                              label,
                                              ignore_index=self.padding_idx)
        nll, token_nll = _summarize(token_loss)
        metrics["nll"] = nll
        metrics["token_nll"] = token_nll
        loss = nll

        if self.num_latent > 0 and self.with_bow:
            # Repeat the bag-of-words distribution for every target position.
            bow_probs = F.unsqueeze(outputs["bow_probs"], [1])
            bow_probs = layers.expand(bow_probs, [1, label.shape[1], 1])
            if use_smoothing:
                bow_loss = layers.cross_entropy(bow_probs,
                                                smooth_label,
                                                soft_label=True,
                                                ignore_index=self.padding_idx)
            else:
                bow_loss = layers.cross_entropy(bow_probs,
                                                label,
                                                ignore_index=self.padding_idx)
            bow, token_bow = _summarize(bow_loss)
            metrics["bow"] = bow
            metrics["token_bow"] = token_bow
            loss = loss + bow

        if self.num_latent > 0 and self.use_discriminator:
            dis = 0.0 - (layers.log(outputs["pos_probs"]) +
                         layers.log(1.0 - outputs["neg_probs"]))
            dis = layers.reduce_mean(dis)
            metrics["dis"] = dis
            loss = loss + dis * self.dis_ratio

        metrics["loss"] = loss
        metrics["token_num"] = tgt_len
        return metrics
Example #6
0
 def test_label_smooth(self):
     """Build a one_hot -> label_smooth graph and check it yields an output."""
     main_program = Program()
     with program_guard(main_program):
         raw_label = layers.data(name="label", shape=[1], dtype="float32")
         smoothed = layers.label_smooth(
             label=layers.one_hot(input=raw_label, depth=10),
             epsilon=0.1,
             dtype="float32")
         self.assertIsNotNone(smoothed)
     # Dump the constructed program for manual inspection.
     print(str(main_program))
Example #7
0
 def test_label_smooth(self):
     """Check that label_smooth over a depth-10 one-hot label returns a value."""
     prog = Program()
     with program_guard(prog):
         label_var = layers.data(name="label", shape=[1], dtype="float32")
         encoded = layers.one_hot(input=label_var, depth=10)
         result = layers.label_smooth(
             label=encoded, epsilon=0.1, dtype="float32")
         self.assertIsNotNone(result)
     # Print the textual form of the program that was built.
     print(str(prog))
Example #8
0
    def __call__(self, predict, label, weights):
        """Compute the weighted softmax cross-entropy loss.

        Args:
            predict: logits; the size of the last dimension is used as the
                one-hot depth when label smoothing is enabled.
            label: target labels; one-hot encoded and smoothed when
                ``self.label_smooth_eps`` is truthy, used as-is otherwise.
            weights: per-position multipliers applied to the cost; their
                sum is used as the token count.

        Returns:
            tuple: ``(sum_cost, avg_cost, token_num)``.
        """
        use_soft_label = bool(self.label_smooth_eps)
        if use_soft_label:
            label_out = layers.label_smooth(
                label=layers.one_hot(input=label, depth=predict.shape[-1]),
                epsilon=self.label_smooth_eps)
        else:
            # Without smoothing the hard labels go straight into the loss.
            # (Leaving ``label_out`` unbound here raised UnboundLocalError.)
            label_out = label

        cost = layers.softmax_with_cross_entropy(
            logits=predict,
            label=label_out,
            soft_label=use_soft_label)
        weighted_cost = cost * weights
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(weights)
        # The token count is a normaliser only; keep it out of back-prop.
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num
        return sum_cost, avg_cost, token_num
Example #9
0
    def forward(self, outputs, labels):
        """Return the weight-normalised cross-entropy loss.

        ``outputs[0]`` holds the logits and ``labels`` unpacks into
        ``(label, weights)``. When ``self.label_smooth_eps`` is truthy the
        labels are one-hot encoded (depth = last logits dimension) and
        smoothed, and the loss is computed against soft labels.
        """
        predict = outputs[0]
        label, weights = labels

        use_soft_label = bool(self.label_smooth_eps)
        if use_soft_label:
            one_hot_label = layers.one_hot(input=label,
                                           depth=predict.shape[-1])
            label = layers.label_smooth(label=one_hot_label,
                                        epsilon=self.label_smooth_eps)

        cost = layers.softmax_with_cross_entropy(logits=predict,
                                                 label=label,
                                                 soft_label=use_soft_label)
        sum_cost = layers.reduce_sum(cost * weights)
        token_num = layers.reduce_sum(weights)
        # Normaliser only; excluded from back-propagation.
        token_num.stop_gradient = True
        return sum_cost / token_num
Example #10
0
    def build_model(self, enc_input, dec_input, tgt_label, label_weights):
        """Build the model with source encoding and target decoding.

        Returns a dict of persistable graph variables: ``"loss"`` (weighted
        average cost), ``"sum_correct"`` (weighted count of positions whose
        arg-max prediction equals the label) and ``"token_num"`` (sum of
        the label weights).
        """
        enc_word_output, enc_sen_output = self.encode(enc_input)
        dec_output = self.decode(dec_input, enc_word_output, enc_sen_output)

        # Accuracy bookkeeping: compare arg-max predictions to the labels.
        predicted_ids = layers.reshape(layers.argmax(dec_output, axis=-1),
                                       shape=[-1, 1])
        is_correct = layers.cast(layers.equal(tgt_label, predicted_ids),
                                 dtype='float32')
        sum_correct = layers.reduce_sum(
            layers.elementwise_mul(x=is_correct, y=label_weights, axis=0))
        sum_correct.stop_gradient = True

        # Padding positions must not contribute to the total loss; the
        # weights are used to cancel them when the loss is accumulated.
        use_soft_label = bool(self._label_smooth_eps)
        if use_soft_label:
            # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing
            # the enforcement that the last dimension of label must be 1.
            one_hot_label = layers.one_hot(input=tgt_label,
                                           depth=self.voc_size)
            tgt_label = layers.label_smooth(label=one_hot_label,
                                            epsilon=self._label_smooth_eps)

        cost = layers.softmax_with_cross_entropy(logits=dec_output,
                                                 label=tgt_label,
                                                 soft_label=use_soft_label)

        sum_cost = layers.reduce_sum(
            layers.elementwise_mul(x=cost, y=label_weights, axis=0))
        token_num = layers.reduce_sum(label_weights)
        token_num.stop_gradient = True

        graph_vars = {
            "loss": sum_cost / token_num,
            "sum_correct": sum_correct,
            "token_num": token_num,
        }
        for var in graph_vars.values():
            var.persistable = True

        return graph_vars
Example #11
0
def std_gen(batch_size=8, seed=None):
    """Generate ``batch_size`` images from random noise and class labels.

    When ``seed`` is not None, ``rds.rng`` is replaced by a ``RandomState``
    seeded with it; when no generator is set yet, ``np.random`` is used.
    Returns a list of PIL images, one per generated sample.
    """
    with dg.no_grad():
        model_cache.train_mode = False
        model_cache.initialized = False
        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random
        G = model_cache.G

        noise = rds.rng.randn(batch_size, 140).astype('float32')
        class_ids = rds.rng.randint(0, 1000, size=[batch_size]).astype('int64')
        x = dg.to_variable(noise)
        y = dg.to_variable(class_ids)
        y_hot = layers.one_hot(layers.unsqueeze(y, [1]), depth=1000)

        # Clip to [0, 1] and scale to 8-bit pixel values.
        pixels = np.uint8(G(x, y_hot).numpy().clip(0, 1) * 255)
        # Move the channel axis last for PIL.
        return [Image.fromarray(sample.transpose([1, 2, 0]))
                for sample in pixels]
Example #12
0
def renorm_gen_interpolate(batch_size=8, seed=None, out_path='data/out.gif'):
    """Render an interpolation between random generator inputs as a GIF.

    The generator is first run once on the whole batch with
    ``model_cache.train_mode`` enabled, then switched to non-train mode for
    rendering. For every consecutive pair of samples (the last one wraps
    around to the first) 40 linearly blended frames are generated. The
    frames are saved to ``out_path`` and the saved file is reopened and
    returned.
    """
    def _blend(tensor, start, weight):
        # Linear mix of sample ``start`` and its successor.
        return ((1 - weight) * tensor[start:start + 1]
                + weight * tensor[start + 1:start + 2])

    with dg.no_grad():
        model_cache.train_mode = True
        model_cache.initialized = True
        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random
        G = model_cache.G

        noise = rds.rng.randn(batch_size, 140).astype('float32')
        class_ids = rds.rng.randint(0, 1000,
                                    size=[batch_size]).astype('int64')
        x = dg.to_variable(noise)
        y = dg.to_variable(class_ids)
        y_hot = layers.one_hot(layers.unsqueeze(y, [1]), depth=1000)
        y_embed = G.embed_y(y_hot)

        # Warm-up pass in train mode; the result itself is discarded.
        G(x, y_embed, True)
        model_cache.train_mode = False
        model_cache.initialized = True

        # Append the first sample so the last key point blends back to it.
        x = layers.concat([x, x[:1]], 0)
        y_embed = layers.concat([y_embed, y_embed[:1]], 0)

        frames = []
        for i in range(batch_size):
            for j in range(40):
                alpha = j / (40 - 1)
                _x = _blend(x, i, alpha)
                _y_embed = _blend(y_embed, i, alpha)
                img_pd = G(_x, _y_embed, True)
                pixels = np.uint8(img_pd.numpy().clip(0, 1) * 255)[0]
                frames.append(Image.fromarray(pixels.transpose([1, 2, 0])))
                stdout.write(f'{i*40+j+1}/{40*batch_size}\r')
                stdout.flush()
        print('')

        first_frame, remaining = frames[0], frames[1:]
        first_frame.save(out_path,
                         save_all=True,
                         append_images=remaining,
                         duration=40,
                         loop=0)
        return Image.open(out_path)
Example #13
0
def transformer(src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                is_test=False,
                model_input=None):
    """
        transformer main

        Builds the encoder/decoder graph from the fields of ``model_input``
        and returns ``[sum_cost, avg_cost, predict, token_num]``. With
        ``weight_sharing`` the source and target vocabulary sizes must be
        equal. ``is_test`` is accepted but not used in this function.
    """
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    enc_inputs = (model_input.src_word, model_input.src_pos,
                  model_input.src_slf_attn_bias)
    dec_inputs = (model_input.trg_word, model_input.trg_pos,
                  model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias)
    label = model_input.lbl_word
    weights = model_input.lbl_weight

    # Hyper-parameters passed identically to the encoder and the decoder.
    stack_hparams = (max_length, n_layer, n_head, d_key, d_value, d_model,
                     d_inner_hid, prepostprocess_dropout, attention_dropout,
                     relu_dropout, preprocess_cmd, postprocess_cmd,
                     weight_sharing)

    enc_output = wrap_encoder(
        *((src_vocab_size, ) + stack_hparams + (enc_inputs, )),
        bos_idx=bos_idx)
    predict = wrap_decoder(
        *((trg_vocab_size, ) + stack_hparams + (dec_inputs, enc_output)))

    # Padding positions must not contribute to the total loss; the weights
    # are used to cancel them when the loss is accumulated.
    use_soft_label = bool(label_smooth_eps)
    if use_soft_label:
        one_hot_label = layers.one_hot(input=label, depth=trg_vocab_size)
        label = layers.label_smooth(label=one_hot_label,
                                    epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(logits=predict,
                                             label=label,
                                             soft_label=use_soft_label)
    sum_cost = layers.reduce_sum(cost * weights)
    token_num = layers.reduce_sum(weights)
    # Normaliser only; excluded from back-propagation.
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return [sum_cost, avg_cost, predict, token_num]
Example #14
0
    def inference(self, model, inputs, outputs):
        """
        Run inference.

        Builds a step-by-step decoding loop (``layers.While``) that calls the
        model once per step, picks candidates by beam search or sampling
        according to ``self.decoding_strategy``, and decodes the collected
        ids/scores with ``layers.beam_search_decode``.

        Args:
            model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`.
            inputs(dict): Its key is input name(str) and its value is a Variable.
                Keys read here: "tgt_ids", "tgt_pos", "init_score",
                "tgt_generation_mask", "parent_idx", "token_ids", "data_id".
            outputs: Not used by this method.

        Returns:
            dict(str:Variable): Its key is output name(str) and its value is a Variable.
                Keys: "finished_ids", "finished_scores", "token_ids", "data_id".

        Raises:
            ValueError: If the decoding strategy is neither "beam_search" nor
                prefixed with "sampling" / "topk_sampling".
        """
        # prepare while loop
        max_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
        min_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
        step_idx = layers.fill_constant(
            shape=[1], dtype="int64", value=0, force_cpu=True)

        # Per-step state lives in tensor arrays indexed by step_idx; slot 0
        # is seeded from the inputs.
        ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
        scores = layers.array_write(inputs["init_score"], step_idx)
        tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
        parent_idx = inputs["parent_idx"]

        # Sampling strategies keep a single hypothesis.
        if self.decoding_strategy == "beam_search":
            beam_size = self.beam_size
        else:
            beam_size = 1

        # Additive logit penalty that suppresses EOS; applied only while
        # step_idx < min_len (see min_len_penalty below).
        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e9
        eos_penalty = layers.assign(eos_penalty)

        # Additive logit penalty that suppresses the unk token and, when
        # mask_id is non-negative, the mask token; applied if self.ignore_unk.
        token_penalty = np.zeros(self.vocab_size, dtype="float32")
        token_penalty[self.unk_id] = -1e9
        if self.mask_id >= 0:
            token_penalty[self.mask_id] = -1e9
        token_penalty = layers.assign(token_penalty)

        # start while loop
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        with while_op.block():
            # Read the state written by the previous step.
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            # Reorder by parent_idx (updated from beam_search at the end of each step).
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
            dtype = tmp_tgt_generation_mask.dtype

            # Grow the mask by one entry (value 1.0) along axis 2, then
            # reorder it by parent_idx.
            append_mask = layers.fill_constant_batch_size_like(
                    input=pre_ids,
                    value=1.0,
                    shape=[-1, 1, 1],
                    dtype=dtype)
            tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2)
            pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx)

            # Type ids for the step: constant 1 for every row.
            pre_sent = layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype)

            # Position ids: the current step index, offset by pos_bias when
            # self.continuous_position is set.
            if self.continuous_position:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias
            else:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0)

            # Role ids: constant 0 for every row when roles are enabled.
            if self.use_role:
                pre_role = layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=0,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype)
            else:
                pre_role = None

            dec_out, _ = model._generation_network(
                token_ids=pre_ids,
                type_ids=pre_sent,
                pos_ids=pre_pos,
                role_ids=pre_role,
                generation_mask=tmp_tgt_generation_mask,
                gather_idx=parent_idx)
            logits = model._calc_logits(dec_out)

            # ignore unk and mask token
            if self.ignore_unk:
                logits = layers.elementwise_add(logits, token_penalty, axis=1)

            # min dec length
            min_len_cond = layers.less_than(x=step_idx, y=min_len)
            def min_len_penalty():
                """Plus minimum length penalty."""
                return layers.elementwise_add(logits, eos_penalty, axis=1)
            def no_penalty():
                """No penalty."""
                return logits
            logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)

            # get probs
            probs = layers.softmax(logits / self.temperature)

            if self.decoding_strategy == "beam_search":
                topk_scores, topk_indices = layers.topk(
                    input=probs, k=beam_size)
            else:
                if self.decoding_strategy.startswith("sampling"):
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                elif self.decoding_strategy.startswith("topk_sampling"):
                    # Keep only entries >= the k-th largest probability and
                    # divide by the sum of the top-k probabilities before
                    # sampling; the original probs are restored afterwards
                    # for scoring.
                    topk_probs, _ = layers.topk(input=probs, k=self.topk)
                    ge_cond = layers.cast(
                        layers.greater_equal(
                            probs,
                            layers.unsqueeze(topk_probs[:, -1], [1])),
                        "float32")
                    old_probs = probs
                    probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    probs = old_probs
                else:
                    raise ValueError(self.decoding_strategy)

                # One-hot of the sampled id: the sampled entry keeps its
                # probability, every other entry becomes -1e3, so topk with
                # k=1 returns the sampled id and its probability.
                sampling_scores = layers.one_hot(
                    layers.unsqueeze(sampling_ids, [1]), probs.shape[1]
                )
                sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
                topk_scores, topk_indices = layers.topk(
                    input=sampling_scores, k=1)

            # Length before and after advancing the step counter (in place).
            pre_len = layers.cast(step_idx, "float32")
            layers.increment(x=step_idx, value=1.0, in_place=True)
            cur_len = layers.cast(step_idx, "float32")

            # update scores
            if self.length_average:
                # Running average: (log p + prev * pre_len) / cur_len.
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
            elif self.length_penalty > 0:
                # Rescale with ((5 + len) / 6) ** length_penalty.
                pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
                cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
            else:
                # Plain sum of log-probabilities.
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores, axis=0)
            # Give the candidates the same LoD as pre_ids before beam_search.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            # Store this step's results at the (already incremented) index.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            # Parent indices used by the gathers in the next iteration.
            layers.assign(gather_idx, parent_idx)

            # Continue while step_idx < max_len and selected_ids is non-empty;
            # the result is written into the loop condition variable.
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=self.eos_id)

        predictions = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "token_ids": inputs["token_ids"],
            "data_id": inputs["data_id"]
        }
        return predictions
Example #15
0
def forward_transformer(src_vocab_size,
                        trg_vocab_size,
                        max_length,
                        n_layer,
                        n_head,
                        d_key,
                        d_value,
                        d_model,
                        d_inner_hid,
                        prepostprocess_dropout,
                        attention_dropout,
                        relu_dropout,
                        preprocess_cmd,
                        postprocess_cmd,
                        weight_sharing,
                        embedding_sharing,
                        label_smooth_eps,
                        use_py_reader=False,
                        is_test=False,
                        params_type="normal",
                        all_data_inputs=None):
    """
    Build the encoder/decoder graph and its weighted cross-entropy loss.

    The network inputs are taken from ``all_data_inputs`` when
    ``use_py_reader`` is true; otherwise they are created with
    ``make_all_inputs``. In both cases they must follow the order
    encoder fields, decoder fields (without the last one), label fields,
    dense-bias fields.

    Args:
        src_vocab_size, trg_vocab_size: vocabulary sizes; they must be equal
            when ``embedding_sharing`` is enabled.
        max_length .. embedding_sharing: forwarded unchanged to
            ``wrap_encoder`` / ``wrap_decoder``.
        label_smooth_eps: when truthy, the hard label is replaced by a
            smoothed one-hot distribution and soft-label cross entropy is used.
        use_py_reader: read the inputs from ``all_data_inputs`` instead of
            creating them.
        is_test: the decoder is built with ``is_train = not is_test``.
        params_type: forwarded to ``wrap_encoder`` / ``wrap_decoder``.
        all_data_inputs: pre-built input variables, used only when
            ``use_py_reader`` is true.

    Returns:
        The tuple ``(sum_cost, avg_cost, token_num, batch_predict, cost,
        sum_cost, real_label, batch_weights)``. ``sum_cost`` appears twice;
        this is kept so that existing callers' unpacking stays valid.
    """
    if embedding_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    data_input_names = encoder_data_input_fields + \
                decoder_data_input_fields[:-1] + label_data_input_fields + dense_bias_input_fields

    if use_py_reader:
        all_inputs = all_data_inputs
    else:
        all_inputs = make_all_inputs(data_input_names)

    # Split the flat input list back into its groups by position.
    enc_inputs_len = len(encoder_data_input_fields)
    dec_inputs_len = len(decoder_data_input_fields[:-1])
    label_start = enc_inputs_len + dec_inputs_len
    enc_inputs = all_inputs[0:enc_inputs_len]
    dec_inputs = all_inputs[enc_inputs_len:label_start]
    real_label = all_inputs[label_start]
    weights = all_inputs[label_start + 1]
    # Extracted for parity with the input layout; not used further in this
    # function.
    reverse_label = all_inputs[label_start + 2]

    enc_output = wrap_encoder(
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        embedding_sharing,
        enc_inputs,
        params_type=params_type)

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        embedding_sharing,
        dec_inputs,
        enc_output,
        is_train=not is_test,
        params_type=params_type)

    # Padding index do not contribute to the total loss. The weights is used to
    # cancel padding index in calculating the loss.
    if label_smooth_eps:
        # Manual label smoothing: the true class keeps 1 - eps, the remaining
        # eps is spread evenly over the other trg_vocab_size - 1 classes.
        label = layers.one_hot(input=real_label, depth=trg_vocab_size)
        label = label * (1 - label_smooth_eps) + (1 - label) * (
            label_smooth_eps / (trg_vocab_size - 1))
        label.stop_gradient = True
    else:
        label = real_label

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=bool(label_smooth_eps))
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    sum_cost.persistable = True
    token_num = layers.reduce_sum(weights)
    token_num.persistable = True
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num

    # Regroup the flat predictions and weights by the leading dimension of the
    # first decoder input. The last dimension uses the `trg_vocab_size`
    # argument (the size the decoder was built with) rather than a global
    # hyper-parameter, so the reshape always matches `predict`.
    sen_count = layers.shape(dec_inputs[0])[0]
    batch_predict = layers.reshape(
        predict, shape=[sen_count, -1, trg_vocab_size])
    batch_weights = layers.reshape(weights, shape=[sen_count, -1, 1])
    return sum_cost, avg_cost, token_num, batch_predict, cost, sum_cost, real_label, batch_weights
Example #16
0
    def _init_train(self):
        """
        Build the joint training program and the prediction program.

        For every task instance this creates the train reader and paradigm
        (plus the pred paradigm for target instances), checks that reader,
        backbone and task-layer inputs/outputs agree, merges all reader
        outputs into one joint input, builds the backbone and task layers,
        combines the per-task losses through a one-hot task id, runs the
        configured optimizer and finally executes the startup program.

        Results are stored on ``self`` (``train_backbone``, ``train_program``,
        ``saver_program``, ``main_inst``, ``fetches``, ``has_init_train``,
        ``has_init_pred``) and on each instance (``reader['train']``,
        ``task_layer``, ``pred_input``, ``steps_pur_epoch``,
        ``expected_train_steps``).

        Raises:
            ValueError: if no instance is marked as a target.
        """
        instances = self.instances
        Backbone = self.Backbone
        bb_conf = self.bb_conf
        bb_name = self.bb_name
        dev_count = self.dev_count
        num_instances = len(instances)

        # set first_target/main task instance
        main_inst = None
        for inst in instances:
            if inst.is_target:
                main_inst = inst
                inst.is_first_target = True
                break
        if main_inst is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'At least one task instance must be a target task.')
        main_conf = main_inst.config
        if not os.path.exists(main_conf['save_path']):
            os.makedirs(main_conf['save_path'])

        # prepare backbone
        train_backbone = Backbone(bb_conf, phase='train')
        pred_backbone = Backbone(bb_conf, phase='pred')

        # create reader, task
        # then check i/o across reader, backbone and task_layer
        task_attrs = []
        pred_task_attrs = []
        for inst in instances:

            train_reader = inst.Reader(inst.config, phase='train')
            inst.reader['train'] = train_reader
            train_parad = inst.Paradigm(inst.config,
                                        phase='train',
                                        backbone_config=bb_conf)
            inst.task_layer['train'] = train_parad
            task_attr_from_reader = _encode_inputs(
                train_parad.inputs_attrs['reader'], inst.name)
            task_attrs.append(task_attr_from_reader)

            _check_io(train_backbone.inputs_attr,
                      train_reader.outputs_attr,
                      in_name=bb_name + '_backbone',
                      out_name='reader.train')
            _check_io(train_parad.inputs_attrs['reader'],
                      train_reader.outputs_attr,
                      in_name='task_paradigm.train.reader',
                      out_name='reader.train')
            _check_io(train_parad.inputs_attrs['backbone'],
                      train_backbone.outputs_attr,
                      in_name='task_paradigm.train.backbone',
                      out_name=bb_name + '_backbone')

            if inst.is_target:
                if 'pred_file' not in inst.config:
                    inst.config['pred_file'] = ''
                # The reader created here is a dummy that exists only so its
                # outputs_attr can be read; it is deliberately not stored in
                # inst.reader['pred'].
                pred_reader = inst.Reader(inst.config, phase='pred')
                pred_parad = inst.Paradigm(inst.config,
                                           phase='pred',
                                           backbone_config=bb_conf)
                inst.task_layer['pred'] = pred_parad
                # Workaround for a framework pitfall; keep it this way for now.
                task_attr_from_reader = _encode_inputs(
                    pred_parad.inputs_attrs['reader'], inst.name)
                pred_task_attrs.append(task_attr_from_reader)
                _check_io(pred_backbone.inputs_attr,
                          pred_reader.outputs_attr,
                          in_name=bb_name + '_backbone',
                          out_name='reader.pred')
                _check_io(pred_parad.inputs_attrs['reader'],
                          pred_reader.outputs_attr,
                          in_name='task_paradigm.pred.reader',
                          out_name='reader.pred')
                _check_io(pred_parad.inputs_attrs['backbone'],
                          pred_backbone.outputs_attr,
                          in_name='task_paradigm.pred.backbone',
                          out_name=bb_name + '_backbone')

        # merge reader input attrs from backbone and task_instances
        joint_input_names, joint_shape_and_dtypes, name_to_position = merge_input_attrs(
            train_backbone.inputs_attr, task_attrs)
        pred_joint_input_names, pred_joint_shape_and_dtypes, _ = merge_input_attrs(
            pred_backbone.inputs_attr,
            pred_task_attrs,
            insert_taskid=False,
            insert_batchsize=False,
            insert_seqlen=False,
            insert_batchsize_x_seqlen=False)
        # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]

        if DEBUG:
            print('----- for debug -----')
            print('joint input names:')
            print(joint_input_names)
            print('joint input shape and dtypes:')
            print(joint_shape_and_dtypes)

        # load data
        for inst in instances:
            print(inst.name + ": preparing data...")
            inst.reader['train'].load_data()

        # merge dataset iterators and create net input vars
        iterators = [inst.reader['train'].iterator() for inst in instances]
        prefixes = [inst.name for inst in instances]
        mrs = [inst.mix_ratio for inst in instances]

        joint_iterator_fn = create_joint_iterator_fn(iterators,
                                                     prefixes,
                                                     joint_shape_and_dtypes,
                                                     mrs,
                                                     name_to_position,
                                                     dev_count=dev_count,
                                                     verbose=VERBOSE)

        input_attrs = [[
            name, shape, dtype
        ] for name, (shape, dtype) in zip(joint_input_names,
                                          joint_shape_and_dtypes)]
        pred_input_attrs = [[
            name, shape, dtype
        ] for name, (shape, dtype) in zip(pred_joint_input_names,
                                          pred_joint_shape_and_dtypes)]
        # `async` is a reserved word since Python 3.7, so writing `async=True`
        # directly is a SyntaxError there. Unpacking a dict passes the very
        # same keyword to the callee and parses on every Python version.
        net_inputs = create_net_inputs(input_attrs,
                                       iterator_fn=joint_iterator_fn,
                                       dev_count=dev_count,
                                       n_prefetch=3,
                                       **{'async': True})

        # build backbone and task layers
        # Building fails if no scope name is given (framework quirk).
        train_prog = fluid.default_main_program()
        train_init_prog = fluid.default_startup_program()
        # Do not use fluid.unique_name.guard here: it has no effect on names
        # set through param_attr.
        bb_output_vars = train_backbone.build(net_inputs,
                                              scope_name='__paddlepalm_')
        assert sorted(bb_output_vars.keys()) == sorted(
            train_backbone.outputs_attr.keys())

        # A separate program pair is required for prediction; sharing the
        # training program was found to fail.
        pred_prog = fluid.Program()
        pred_init_prog = fluid.Program()

        with fluid.program_guard(main_program=pred_prog,
                                 startup_program=pred_init_prog):
            pred_net_inputs = create_net_inputs(pred_input_attrs)
            # As above, unique_name.guard is not used because it does not
            # affect names set through param_attr.
            pred_bb_output_vars = pred_backbone.build(
                pred_net_inputs, scope_name='__paddlepalm_')

        fluid.framework.switch_main_program(train_prog)
        fluid.framework.switch_startup_program(train_init_prog)

        task_output_vars = {}
        for inst in instances:
            task_inputs = {'backbone': bb_output_vars}
            task_inputs_from_reader = _decode_inputs(net_inputs, inst.name)
            task_inputs['reader'] = task_inputs_from_reader

            scope = inst.task_reuse_scope + '/'
            with fluid.unique_name.guard(scope):
                output_vars = inst.build_task_layer(task_inputs,
                                                    phase='train',
                                                    scope=scope)
                output_vars = {
                    inst.name + '/' + key: val
                    for key, val in output_vars.items()
                }
                old = len(task_output_vars)  # for debug
                task_output_vars.update(output_vars)
                # Every prefixed key must be new, i.e. no two tasks collide.
                assert len(task_output_vars) - old == len(
                    output_vars)  # for debug

            # prepare predict vars for saving inference model
            if inst.is_target:

                with fluid.program_guard(pred_prog, pred_init_prog):
                    # This also builds the pred-phase backbone graph; it is
                    # unclear whether that costs extra device memory.
                    cur_inputs = _decode_inputs(pred_net_inputs, inst.name)
                    inst.pred_input = cur_inputs
                    pred_task_inputs = {
                        'backbone': pred_bb_output_vars,
                        'reader': cur_inputs
                    }
                    scope = inst.task_reuse_scope + '/'
                    # Note: this fails without fluid.unique_name.guard.
                    with fluid.unique_name.guard(scope):
                        inst.build_task_layer(pred_task_inputs,
                                              phase='pred',
                                              scope=scope)

        # Only task outputs are fetched. Backbone outputs are left out on
        # purpose: with multiple devices the framework cannot fetch tensors
        # that have variable-length dimensions, so adding them fails.
        fetches = {k: v.name for k, v in task_output_vars.items()}
        fetches['__task_id'] = net_inputs['__task_id'].name

        # compute loss: the one-hot task id selects the loss of the task that
        # produced the current batch.
        task_id_var = net_inputs['__task_id']
        task_id_vec = layers.one_hot(task_id_var, num_instances)
        losses = fluid.layers.concat(
            [task_output_vars[inst.name + '/loss'] for inst in instances],
            axis=0)
        loss = layers.reduce_sum(task_id_vec * losses)

        main_reader = main_inst.reader['train']

        num_examples = main_reader.num_examples
        for inst in instances:
            max_train_steps = int(
                main_conf['num_epochs'] * inst.mix_ratio *
                (num_examples // main_conf['batch_size'] // dev_count))
            if inst.is_target:
                print('{}: expected train steps {}.'.format(
                    inst.name, max_train_steps))
            inst.steps_pur_epoch = inst.reader[
                'train'].num_examples // main_conf['batch_size'] // dev_count
            inst.expected_train_steps = max_train_steps

        global_max_train_steps = int(
            main_conf['num_epochs'] * sum(mrs) *
            (num_examples // main_conf['batch_size'] // dev_count))
        print(
            'Estimated overall train steps {}.'.format(global_max_train_steps))

        if 'warmup_proportion' in main_conf and main_conf[
                'warmup_proportion'] > 0:
            warmup_steps = int(global_max_train_steps *
                               main_conf['warmup_proportion'])
            print('Warmup steps: ' + str(warmup_steps))
        else:
            warmup_steps = 0

        # build optimizer
        # Per-task optimizers could be supported as well.
        if 'optimizer' in main_conf:
            optim_mod = importlib.import_module(OPTIMIZER_DIR + '.' +
                                                main_conf['optimizer'])
            optimize = getattr(optim_mod, OPTIMIZE_METHOD)
            # NOTE(review): `max_train_steps` is the value left by the loop
            # above (the last instance's), while `warmup_steps` is derived
            # from `global_max_train_steps` -- confirm this is intended.
            optimize(loss, main_conf, max_train_steps, warmup_steps,
                     fluid.default_main_program())

            loss.persistable = True
            if main_conf.get('use_ema', False):
                assert 'ema_decay' in main_conf, "ema_decay should be set when use_ema is enabled."
                ema = fluid.optimizer.ExponentialMovingAverage(
                    main_conf['ema_decay'])
                ema.update()

        # prepare for train
        self.train_backbone = train_backbone
        self.train_program = fluid.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(
                loss_name=loss.name)
        self.saver_program = fluid.default_main_program()

        self.main_inst = main_inst
        self.fetches = fetches
        self.has_init_train = True
        self.has_init_pred = True

        self.exe.run(fluid.default_startup_program())
        print("\nRandomly initialize parameters...\n")
Example #17
0
def transformer(src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                use_py_reader=False,
                is_test=False):
    """
    Assemble the encoder/decoder network and its weighted cross-entropy loss.

    Inputs come from ``make_all_py_reader_inputs`` when ``use_py_reader`` is
    true and from ``make_all_inputs`` otherwise; the last two inputs are the
    label and the per-token loss weights.

    Returns:
        ``(sum_cost, avg_cost, predict, token_num, reader)`` where ``reader``
        is the py_reader when ``use_py_reader`` is true and ``None`` otherwise.

    ``bos_idx`` is accepted but not used inside this function.
    """
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    decoder_fields = decoder_data_input_fields[:-1]
    field_names = (encoder_data_input_fields + decoder_fields +
                   label_data_input_fields)

    reader = None
    if use_py_reader:
        all_inputs, reader = make_all_py_reader_inputs(field_names, is_test)
    else:
        all_inputs = make_all_inputs(field_names)

    # Slice the flat input list: encoder group, decoder group, then the
    # trailing (label, weights) pair.
    n_enc = len(encoder_data_input_fields)
    n_dec = len(decoder_fields)
    enc_inputs = all_inputs[0:n_enc]
    dec_inputs = all_inputs[n_enc:n_enc + n_dec]
    label = all_inputs[-2]
    weights = all_inputs[-1]

    # NOTE(review): the encoder is built with a fixed maximum length of 64
    # instead of `max_length` -- confirm this is intentional.
    enc_output = wrap_encoder(
        src_vocab_size,
        64,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        enc_inputs)

    predict = wrap_decoder(
        trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model,
        d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout,
        preprocess_cmd, postprocess_cmd, weight_sharing, dec_inputs,
        enc_output)

    # Padding positions must not add to the loss; `weights` masks them out.
    use_soft_label = bool(label_smooth_eps)
    if use_soft_label:
        one_hot_label = layers.one_hot(input=label, depth=trg_vocab_size)
        label = layers.label_smooth(
            label=one_hot_label, epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict, label=label, soft_label=use_soft_label)
    masked_cost = cost * weights
    sum_cost = layers.reduce_sum(masked_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num, reader
Example #18
0
    def _build_decoder(self,
                       z_mean=None,
                       z_log_var=None,
                       enc_output=None,
                       mode='train',
                       beam_size=10):
        """
        Build the decoder graph for training or for generation.

        Args:
            z_mean, z_log_var: passed to ``self._sampling`` to draw the latent
                code; only read when ``mode == 'train'``. In every other mode
                the latent code is drawn with
                ``layers.gaussian_random_batch_size_like``.
            enc_output: accepted but not used inside this method.
            mode: ``'train'`` runs the decoder cell over ``self.tar_emb`` and
                returns the projected outputs; ``'greedy'`` and ``'sampling'``
                decode step by step and return the decoder outputs.
            beam_size: accepted but not used inside this method; generation
                always runs with a beam of 1.

        Returns:
            The decoder output for the selected mode, or ``None`` (after
            printing a message) when ``mode`` is not one of the three above.
        """
        dec_input = layers.dropout(self.tar_emb,
                                   dropout_prob=self.dec_dropout_in,
                                   dropout_implementation="upscale_in_train")

        def output_layer(x):
            # Projection onto the target vocabulary; also handed to
            # BeamSearchDecoder as its output function.
            return layers.fc(x,
                             size=self.tar_vocab_size,
                             num_flatten_dims=len(x.shape) - 1,
                             name="output_w")

        def sample_output_layer(x):
            # Samples an id from the softmax of the logits instead of leaving
            # the choice to an argmax, then re-encodes the id as a one-hot
            # vector so it can stand in for the logits.
            probs = layers.softmax(layers.squeeze(output_layer(x), [1]))
            sampled = layers.sampling_id(probs, dtype='int')
            one_hot = layers.one_hot(layers.unsqueeze(sampled, [1]),
                                     depth=self.tar_vocab_size)
            return layers.unsqueeze(one_hot, [1])

        if mode == 'train':
            latent_z = self._sampling(z_mean, z_log_var)
        else:
            latent_z = layers.gaussian_random_batch_size_like(
                self.tar, shape=[-1, self.latent_size])

        # One fc layer produces the initial hidden and cell states of all
        # layers at once; they are split apart below.
        dec_first_hidden_cell = layers.fc(latent_z,
                                          2 * self.hidden_size *
                                          self.num_layers,
                                          name='fc_hc')
        dec_first_hidden, dec_first_cell = layers.split(
            dec_first_hidden_cell, 2)
        if self.num_layers > 1:
            dec_first_hidden = layers.split(dec_first_hidden, self.num_layers)
            dec_first_cell = layers.split(dec_first_cell, self.num_layers)
        else:
            dec_first_hidden = [dec_first_hidden]
            dec_first_cell = [dec_first_cell]
        dec_initial_states = [[h, c]
                              for h, c in zip(dec_first_hidden, dec_first_cell)
                              ]
        dec_cell = DecoderCell(self.num_layers, self.hidden_size, latent_z,
                               self.param_attr_initializer,
                               self.param_attr_scale, self.dec_dropout_out)

        if mode == 'train':
            dec_output, _ = rnn(cell=dec_cell,
                                inputs=dec_input,
                                initial_states=dec_initial_states,
                                sequence_length=self.tar_sequence_length)
            return output_layer(dec_output)

        if mode in ('greedy', 'sampling'):
            # The two generation modes differ only in how a step's output is
            # turned into the next token.
            output_fn = output_layer if mode == 'greedy' else sample_output_layer
            start_token = 1
            end_token = 2
            max_length = 100
            beam_search_decoder = BeamSearchDecoder(
                dec_cell,
                start_token,
                end_token,
                beam_size=1,
                embedding_fn=self.tar_embeder,
                output_fn=output_fn)
            outputs, _ = dynamic_decode(beam_search_decoder,
                                        inits=dec_initial_states,
                                        max_step_num=max_length)
            return outputs

        # Unknown mode: report it and return None, as before.
        print("mode not supported", mode)
        return None
Example #19
0
def model():
    """model"""
    user_phone_brand_id = layers.data(name='user_phone_brand', shape=[1], dtype='int64')
    user_gender_id = layers.data(name='user_gender', shape=[1], dtype='int64')
    user_age_id = layers.data(name='user_age', shape=[1], dtype='int64')
    user_status_id = layers.data(name='user_status', shape=[1], dtype="int64")
    user_trade_id = fluid.layers.data(name='user_trade', shape=[1], dtype='int64')
    user_cater_id = fluid.layers.data(name='user_cater', shape=[1], dtype='int64')
    user_income_id = fluid.layers.data(name='user_income', shape=[1], dtype='int64')

    user_city_id = fluid.layers.data(name='user_city', shape=[1], dtype='int64')

    user_click_id = fluid.layers.data(name='user_click', shape=[1], dtype='int64')
    user_b_click_id = fluid.layers.data(name='user_b_click', shape=[1], dtype='int64')
    user_c_click_id = fluid.layers.data(name='user_c_click', shape=[1], dtype='int64')
    user_d_click_id = fluid.layers.data(name='user_d_click', shape=[1], dtype='int64')

    week_id = layers.data(name='week', shape=[1], dtype="int64")
    hour_id = layers.data(name='hour', shape=[1], dtype='int64')

    content_b_c_d_id = layers.data(name='content_b_c_d', shape=[1], dtype='int64')
    content_tags_id = layers.data(name='content_tags', shape=[1], dtype='int64', lod_level=1)
    content_subtags_id = layers.data(name='content_subtags', shape=[1], dtype='int64', lod_level=1)

    user_content_tag_click_id = layers.data(name='user_content_tag_click', shape=[1], dtype='int64')
    user_content_subtag_click_id = layers.data(name='user_content_subtag_click', shape=[1], dtype='int64')

    content_pctr_discrete_id = layers.data(name='content_pctr_discrete', shape=[1], dtype='int64')
    # dnn_score_discrete_id = layers.data(name='dnn_score_discrete', shape=[1], dtype='int64')

    content_pctr = layers.data(name='content_pctr', shape=[1], dtype='float32')
    # dnn_score = layers.data(name='dnn_score', shape=[1], dtype='float32')
    # content_emb = layers.data(name='content_emb', shape=[64], dtype='float32')
    # user_emb = layers.data(name='user_emb', shape=[64], dtype='float32')

    user_click_tags_id = layers.data(
        name='user_click_tags_id', shape=[1], dtype='int64', lod_level=1)
    user_click_subtags_id = layers.data(
        name='user_click_subtags_id', shape=[1], dtype='int64', lod_level=1)
    candidate_title_word = layers.data(name='candidate_title', shape=[1], dtype='int64', lod_level=1)
    candidate_subtitle_word = layers.data(name='candidate_subtitle', shape=[1], dtype='int64', lod_level=1)
    candidate_title_len_id = layers.data(name='candidate_title_len', shape=[1], dtype='int64')
    candidate_subtitle_len_id = layers.data(name='candidate_subtitle_len', shape=[1], dtype='int64')

    click_title_list = layers.data(name='click_title_list', shape=[1], dtype='int64', lod_level=2)
    click_subtitle_list = layers.data(name='click_subtitle_list', shape=[1], dtype='int64', lod_level=2)
    click_title_len_list = layers.data(name='click_title_len_list', shape=[1], dtype='int64', lod_level=1)
    click_subtitle_len_list = layers.data(name='click_subtitle_len_list', shape=[1], dtype='int64', lod_level=1)

    label = layers.data(name='label', shape=[1], dtype='int64')
    # dnn_score_discrete_id.name, dnn_score.name, content_emb.name,user_emb.name,
    load_list = [user_phone_brand_id, user_gender_id, user_age_id,
                  user_status_id, user_trade_id, user_cater_id, user_income_id,
                  user_city_id, user_click_id, user_b_click_id, user_c_click_id,
                  user_d_click_id, week_id, hour_id, content_b_c_d_id,
                  content_tags_id, content_subtags_id, user_content_tag_click_id,
                  user_content_subtag_click_id, content_pctr_discrete_id,
                  content_pctr,
                  user_click_tags_id, user_click_subtags_id, candidate_title_word,
                  candidate_subtitle_word, candidate_title_len_id, candidate_subtitle_len_id,
                  click_title_list, click_subtitle_list,
                  click_title_len_list, click_subtitle_len_list,
                  label]
    feed_order = [x.name for x in load_list]

    user_phone_brand_emb = layers.embedding(
        input=user_phone_brand_id, dtype='float32',
        size=[7, EMB_LEN], param_attr='user_phone_brand_emb', is_sparse=True)
    user_gender_emb = layers.embedding(
        input=user_gender_id, dtype='float32',
        size=[3, EMB_LEN], param_attr='user_gender_emb', is_sparse=True)
    user_age_emb = layers.embedding(
        input=user_age_id, dtype='float32',
        size=[8, EMB_LEN], param_attr='user_age_emb', is_sparse=True)
    user_status_emb = layers.embedding(
        input=user_status_id, dtype='float32',
        size=[3, EMB_LEN], is_sparse=True, param_attr='user_status_emb')
    user_trade_emb = layers.embedding(
        input=user_trade_id, dtype='float32',
        size=[24, EMB_LEN], is_sparse=True, param_attr='user_trade_emb')
    user_cater_emb = layers.embedding(
        input=user_cater_id, dtype='float32',
        size=[4, EMB_LEN], is_sparse=True, param_attr='user_cater_emb')
    user_income_emb = layers.embedding(
        input=user_income_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_income_emb')

    user_city_emb = layers.embedding(
        input=user_city_id, dtype='float32',
        size=[4000, EMB_LEN], is_sparse=True, param_attr='user_city_emb')

    user_click_emb = layers.embedding(
        input=user_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_click_emb')
    user_b_click_emb = layers.embedding(
        input=user_b_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_b_click_emb')
    user_c_click_emb = layers.embedding(
        input=user_c_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_c_click_emb')
    user_d_click_emb = layers.embedding(
        input=user_d_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_d_click_emb')

    week_emb = layers.embedding(
        input=week_id, dtype='float32',
        size=[8, EMB_LEN], is_sparse=True, param_attr='week_emb')
    hour_emb = layers.embedding(
        input=hour_id, dtype='float32',
        size=[24, EMB_LEN], is_sparse=True, param_attr='hour_emb')

    content_b_c_d_emb = layers.embedding(
        input=content_b_c_d_id, dtype='float32',
        size=[3, EMB_LEN], is_sparse=True, param_attr='content_b_c_d_emb')

    content_tags_emb = layers.embedding(
        input=content_tags_id, size=[11, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="content_tags_emb", learning_rate=0.5, regularizer=fluid.regularizer.L2Decay(1.0))
    )
    content_tags_emb_avg = fluid.layers.sequence_pool(input=content_tags_emb, pool_type='average')

    content_subtags_emb = layers.embedding(
        input=content_subtags_id, size=[65, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="content_subtags_emb", learning_rate=0.5,
            regularizer=fluid.regularizer.L2Decay(1.0))
    )
    content_subtags_emb_avg = fluid.layers.sequence_pool(
        input=content_subtags_emb, pool_type='average')

    user_content_tag_click_emb = layers.embedding(
        input=user_content_tag_click_id, dtype='float32',
        size=[11 * 6, EMB_LEN], is_sparse=True, param_attr='user_content_tag_click_emb')
    user_content_subtag_click_emb = layers.embedding(
        input=user_content_subtag_click_id, dtype='float32',
        size=[65 * 6, EMB_LEN], is_sparse=True, param_attr='user_content_subtag_click_emb')

    content_pctr_discrete_emb = layers.embedding(
        input=content_pctr_discrete_id, dtype='float32',
        size=[55, EMB_LEN], is_sparse=True, param_attr='content_pctr_discrete_emb')
    # dnn_score_discrete_emb = layers.embedding(
    #     input=dnn_score_discrete_id, dtype='float32',
    #     size=[21, EMB_LEN], is_sparse=True, param_attr='dnn_score_discrete_emb')

    user_click_tags_id_emb = layers.embedding(
        input=user_click_tags_id, size=[11 * 6, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr="user_content_tag_click_emb")
    user_click_tags_id_emb_avg = fluid.layers.sequence_pool(
        input=user_click_tags_id_emb, pool_type='average')
    user_click_subtags_id_emb = layers.embedding(
        input=user_click_subtags_id, size=[65 * 6, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr="user_content_subtag_click_emb")
    user_click_subtags_id_emb_avg = fluid.layers.sequence_pool(
        input=user_click_subtags_id_emb, pool_type='average')

    # 候选内容feature生成
    cand_title_emb = layers.embedding(input=candidate_title_word, size=[19962, EMB_LEN], dtype='float32',
                                      is_sparse=False, param_attr='word_embedding')
    cand_title_conv_pool = nets.sequence_conv_pool(
        input=cand_title_emb, num_filters=NUM_FILTERS, filter_size=3,
        act="relu", pool_type="average", param_attr='title_emb_conv', bias_attr='title_emb_conv_b')

    cand_subtitle_emb = layers.embedding(input=candidate_subtitle_word, size=[19962, EMB_LEN], dtype='float32',
                                         is_sparse=False, param_attr='word_embedding')
    cand_subtitle_conv_pool = nets.sequence_conv_pool(
        input=cand_subtitle_emb, num_filters=NUM_FILTERS, filter_size=3,
        act="relu", pool_type="average", param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b')

    cand_title_len_emb = layers.embedding(input=candidate_title_len_id, size=[100, EMB_LEN], dtype='float32',
                                          is_sparse=True, param_attr='title_len_emb')
    cand_subtitle_len_emb = layers.embedding(input=candidate_subtitle_len_id, size=[100, EMB_LEN], dtype='float32',
                                             is_sparse=True, param_attr='subtitle_len_emb')

    cand_title_inf = layers.concat(
        input=[cand_title_conv_pool, cand_subtitle_conv_pool,
               cand_title_len_emb, cand_subtitle_len_emb], axis=-1)
    cand_title_feature = layers.fc(
        input=cand_title_inf, size=32, act="relu", param_attr='title_feature_list') #共享参数

    # 用户历史点击内容feature生成
    click_title_emb = layers.embedding(input=click_title_list, size=[19962, EMB_LEN], dtype='float32',
                                       is_sparse=False, param_attr='word_embedding')
    click_title_drnn = fluid.layers.DynamicRNN()
    with click_title_drnn.block():
        title_emb = click_title_drnn.step_input(click_title_emb)
        click_title_conv_pool = nets.sequence_conv_pool(
            input=title_emb, num_filters=NUM_FILTERS, filter_size=3,
            act="relu", pool_type="average", param_attr='title_emb_conv', bias_attr='title_emb_conv_b')
        click_title_drnn.output(click_title_conv_pool)
    click_title_conv_pool_list = click_title_drnn()

    click_subtitle_emb = layers.embedding(input=click_subtitle_list, size=[19962, EMB_LEN], dtype='float32',
                                       is_sparse=False, param_attr='word_embedding')
    click_subtitle_drnn = fluid.layers.DynamicRNN()
    with click_subtitle_drnn.block():
        subtitle_emb = click_subtitle_drnn.step_input(click_subtitle_emb)
        click_subtitle_conv_pool = nets.sequence_conv_pool(
            input=subtitle_emb, num_filters=NUM_FILTERS, filter_size=3,
            act="relu", pool_type="average", param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b')
        click_subtitle_drnn.output(click_subtitle_conv_pool)
    click_subtitle_conv_pool_list = click_subtitle_drnn()

    click_title_len_emb_list = layers.embedding(input=click_title_len_list, size=[100, EMB_LEN], dtype='float32',
                                          is_sparse=True, param_attr='title_len_emb')
    click_subtitle_len_emb_list = layers.embedding(input=click_subtitle_len_list, size=[100, EMB_LEN], dtype='float32',
                                          is_sparse=True, param_attr='subtitle_len_emb')

    click_title_inf_list = layers.concat(
        input=[click_title_conv_pool_list, click_subtitle_conv_pool_list,
               click_title_len_emb_list, click_subtitle_len_emb_list], axis=-1)
    click_title_feature_list = layers.fc(
        input=click_title_inf_list, size=32, act="relu", param_attr='title_feature_list') #共享参数
    user_click_title_feature = layers.sequence_pool(input=click_title_feature_list, pool_type="average")

    user_emb_feature = layers.concat(
        input=[user_phone_brand_emb, user_gender_emb, user_age_emb, user_status_emb, user_trade_emb,
               user_cater_emb, user_income_emb, user_city_emb,
               user_click_emb, user_b_click_emb, user_c_click_emb, user_d_click_emb], axis=1)
    content_emb_feature = layers.concat(
        input=[content_b_c_d_emb, content_tags_emb_avg, content_subtags_emb_avg,
               content_pctr_discrete_emb, cand_title_feature], axis=1)
    cross_emb_feature = layers.concat(
        input=[user_content_tag_click_emb, user_content_subtag_click_emb,
               user_click_tags_id_emb_avg, user_click_subtags_id_emb_avg,
               user_click_title_feature], axis=1)
    env_emb_feature = layers.concat(
        input=[week_emb, hour_emb], axis=1)

    combined_features = layers.concat(input=[
        user_emb_feature, content_emb_feature, cross_emb_feature, env_emb_feature], axis=1)

    fc1 = layers.fc(input=combined_features, size=200, act='relu', param_attr='fc1', bias_attr='fc1_b')
    fc2 = layers.fc(input=fc1, size=200, act="relu", param_attr='fc2', bias_attr='fc2_b')
    fc3 = layers.fc(input=fc2, size=200, act="relu", param_attr='fc3', bias_attr='fc3_b')

    content_pctr_discrete_id_one_hot = layers.one_hot(
        content_pctr_discrete_id, 55, allow_out_of_range=False)

    final_layer = layers.concat(input=[fc3, content_pctr, content_pctr_discrete_id_one_hot], axis=1)
    predict = layers.fc(
        input=final_layer, size=2, act="softmax",
        param_attr='final_predict', bias_attr='final_predict_b')

    auc = fluid.layers.auc(
        input=predict, label=label, num_thresholds=2 ** 12)
    cost = layers.cross_entropy(input=predict, label=label)
    avg_cost = layers.reduce_mean(cost)

    loader = fluid.io.DataLoader.from_generator(
        feed_list=load_list, capacity=256, use_double_buffer=True, iterable=True)

    return {'predict': predict, 'avg_cost': avg_cost, 'feed_order': feed_order, 'loader': loader, 'auc': auc}
Example #20
0
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    """Build a multi-layer `lstmp_encoder` stack and its softmax loss.

    The input is embedded, passed through `args.num_layers` calls to
    `lstmp_encoder` (with dropout before and after every layer and a residual
    connection from the second layer on), projected onto the vocabulary and
    scored against `y`.

    Args:
        x: input fed to `layers.embedding`.
        y: labels used by the loss.
        vocab_size: number of rows of the embedding table and of the output
            projection weight.
        emb_size: width of the embedding table and of the output projection
            weight.
        init_hidden: optional stacked initial hidden states; layer `i` uses
            slice `i` along axis 0. Only used when `init_cell` is also given.
        init_cell: optional stacked initial cell states, sliced the same way.
            Only used when `init_hidden` is also given.
        para_name: prefix of the per-layer name handed to `lstmp_encoder`
            (`para_name + 'layer<i+1>'`).
        custom_samples: unused by this function.
        custom_probabilities: unused by this function.
        test_mode: forwarded to `dropout` and `lstmp_encoder`; when true the
            full softmax loss is always used.
        args: configuration object. The attributes `num_layers`,
            `hidden_size`, `sample_softmax`, `n_negative_samples_batch` and
            `random_seed` are read here. Required in practice even though the
            default is None.

    Returns:
        A 5-tuple `([x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells,
        projs)` where the four lists hold one entry per layer: the
        post-residual/post-dropout output, the raw `lstmp_encoder` output,
        the (dropped-out) cell and the input projection.
    """

    def _layer_state(stacked_state, layer_idx):
        # Pick the state of one layer along axis 0 and drop that axis.
        return layers.squeeze(layers.slice(stacked_state,
                                           axes=[0],
                                           starts=[layer_idx],
                                           ends=[layer_idx + 1]),
                              axes=[0])

    # Test for "argument supplied" with an identity check instead of relying
    # on the truth value of a tensor-like object, and do it once because the
    # answer does not change between layers.
    has_init_state = init_hidden is not None and init_cell is not None

    x_emb = layers.embedding(input=x,
                             size=[vocab_size, emb_size],
                             dtype='float32',
                             is_sparse=False,
                             param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if has_init_state:
            h0 = _layer_state(init_hidden, i)
            c0 = _layer_state(init_cell, i)
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        # Keep the raw layer output before the residual sum and dropout.
        rnn_out_ori = rnn_out
        if i > 0:
            # Residual connection; the first layer is excluded.
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    # Output projection: logits = last_layer_output * softmax_weight^T + bias.
    softmax_weight = layers.create_parameter([vocab_size, emb_size],
                                             dtype="float32",
                                             name="softmax_weight")
    softmax_bias = layers.create_parameter([vocab_size],
                                           dtype="float32",
                                           name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        # Sampled softmax is only used while training.
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        # Full softmax over one-hot (soft) labels.
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(logits=projection,
                                                 label=label,
                                                 soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
    def _forward(self, inputs, is_training):
        """Run the forward pass shared by the training and evaluation modes.

        Args:
            inputs: mapping holding the "src_*" and "tgt_*" entries for
                token, mask, pos, type and turn. The last position along
                axis 1 of every "tgt_*" entry is dropped before use.
            is_training: when true the latent variable is drawn with
                `F.gumbel_softmax`; otherwise the argmax of the posterior
                logits is one-hot encoded.

        Returns:
            dict that always contains "dec_probs". When `self.num_latent > 0`
            it also contains "post_logits" and "latent_embed", plus
            "pos_probs"/"neg_probs" if `self.use_discriminator` is set and
            "bow_probs" if `self.with_bow` is set.
        """
        outputs = {}

        src_keys = ("src_token", "src_mask", "src_pos", "src_type", "src_turn")
        tgt_keys = ("tgt_token", "tgt_mask", "tgt_pos", "tgt_type", "tgt_turn")
        src_token, src_mask, src_pos, src_type, src_turn = [
            inputs[key] for key in src_keys
        ]
        # Target-side inputs are used without their final position.
        tgt_token, tgt_mask, tgt_pos, tgt_type, tgt_turn = [
            inputs[key][:, :-1] for key in tgt_keys
        ]

        input_mask = layers.concat([src_mask, tgt_mask], axis=1)
        input_mask.stop_gradient = True

        # Embed source and target separately, join them along axis 1 and
        # normalize the result.
        embed = self.embed_layer_norm(
            layers.concat(
                [
                    self.embedder(src_token, src_pos, src_type, src_turn),
                    self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn),
                ],
                axis=1))

        batch_size, src_len = src_token.shape[0], src_token.shape[1]
        tgt_len = tgt_token.shape[1]

        has_latent = self.num_latent > 0
        latent_embed = None
        if has_latent:
            post_embed, _, post_logits = self._posteriori_network(
                input_mask, embed, batch_size, src_len, tgt_len)
            outputs["post_logits"] = post_logits

            if self.use_discriminator:
                outputs["pos_probs"], outputs["neg_probs"] = (
                    self._discriminator_network(input_mask, embed, batch_size,
                                                src_len, tgt_len, post_embed))

            if not is_training:
                # Evaluation: pick the most likely latent value.
                best_latent = layers.argmax(post_logits, axis=1)
                z = layers.one_hot(
                    F.unsqueeze(best_latent, [1]), self.num_latent)
            else:
                # Training: sample with Gumbel-softmax at temperature tau.
                z = F.gumbel_softmax(post_logits, self.tau)
            latent_embed = layers.matmul(z, self.latent_embeddings)
            outputs["latent_embed"] = latent_embed

        # The generation network returns its own latent embedding, which
        # replaces the one computed above for the bag-of-words head.
        latent_embed, dec_probs = self._generation_network(
            input_mask, embed, batch_size, src_len, tgt_len, latent_embed)
        outputs["dec_probs"] = dec_probs

        if has_latent and self.with_bow:
            bow_input = latent_embed
            if self.two_layer_predictor:
                bow_input = self.pre_bow_predictor(bow_input)
            outputs["bow_probs"] = layers.softmax(
                self.bow_predictor(bow_input))

        return outputs
Example #22
0
def transformer(model_input,
                src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                is_test=False):
    """Assemble the encoder-decoder network and its weighted loss.

    Args:
        model_input: object exposing `src_word`, `src_pos`,
            `src_slf_attn_bias`, `trg_word`, `trg_pos`, `trg_slf_attn_bias`,
            `trg_src_attn_bias`, `lbl_word` and `lbl_weight`.
        src_vocab_size: vocabulary size handed to `wrap_encoder`.
        trg_vocab_size: vocabulary size handed to `wrap_decoder`; also the
            one-hot depth used for label smoothing.
        max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
        prepostprocess_dropout, attention_dropout, relu_dropout,
        preprocess_cmd, postprocess_cmd: forwarded unchanged, in this order,
            to both `wrap_encoder` and `wrap_decoder`.
        weight_sharing: forwarded to both wrappers; when truthy the two
            vocabulary sizes must be equal.
        label_smooth_eps: when truthy, labels are one-hot encoded and
            smoothed with this epsilon and the loss uses soft labels.
        bos_idx: forwarded to `wrap_encoder`.
        is_test: accepted but not used by this function.

    Returns:
        Tuple `(sum_cost, avg_cost, predict, token_num)`.
    """
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    enc_inputs = (
        model_input.src_word,
        model_input.src_pos,
        model_input.src_slf_attn_bias,
    )
    dec_inputs = (
        model_input.trg_word,
        model_input.trg_pos,
        model_input.trg_slf_attn_bias,
        model_input.trg_src_attn_bias,
    )
    label, weights = model_input.lbl_word, model_input.lbl_weight

    # Positional hyper-parameters that the encoder and decoder wrappers take
    # in exactly the same order.
    shared_hparams = (
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
    )

    enc_output = wrap_encoder(
        enc_inputs, src_vocab_size, *shared_hparams, bos_idx=bos_idx)
    predict = wrap_decoder(
        dec_inputs, trg_vocab_size, *shared_hparams, enc_output=enc_output)

    use_soft_label = bool(label_smooth_eps)
    if label_smooth_eps:
        # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing
        # the enforcement that the last dimension of label must be 1.
        one_hot_label = layers.one_hot(input=label, depth=trg_vocab_size)
        label = layers.label_smooth(
            label=one_hot_label, epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict, label=label, soft_label=use_soft_label)

    # Padding positions must not contribute to the total loss; multiplying by
    # the label weights cancels them out.
    weighted_cost = layers.elementwise_mul(x=cost, y=weights, axis=0)
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num