Example #1
    def forward(self, enc_inputs, dec_inputs, label, weights):
        """
        forward
        :param enc_inputs:
        :param dec_inputs:
        :param label:
        :param weights:
        :return:
        """
        enc_output = self._wrap_encoder_layer(enc_inputs)
        predict = self._wrap_decoder_layer(dec_inputs, enc_output)
        if self._label_smooth_eps:
            label_out = layers.label_smooth(
                label=layers.one_hot(input=label, depth=self._trg_vocab_size),
                epsilon=self._label_smooth_eps)
        else:
            # Without smoothing, feed the hard label ids directly.
            label_out = label

        cost = layers.softmax_with_cross_entropy(
            logits=predict,
            label=label_out,
            soft_label=True if self._label_smooth_eps else False)
        weighted_cost = cost * weights
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(weights)
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num
        return sum_cost, avg_cost, predict, token_num
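All of these examples follow the same recipe: one-hot encode the target ids, soften them with `layers.label_smooth`, and feed the result to `softmax_with_cross_entropy` with `soft_label=True`. As a plain-NumPy sketch of what the smoothing step computes (assuming the default uniform prior over the `depth` classes):

import numpy as np

def label_smooth(one_hot, epsilon=0.1):
    # Blend the one-hot label with a uniform distribution over K classes:
    # (1 - epsilon) * one_hot + epsilon / K
    k = one_hot.shape[-1]
    return (1.0 - epsilon) * one_hot + epsilon / k

one_hot = np.eye(4)[[2]]          # label id 2, depth 4
print(label_smooth(one_hot))      # [[0.025 0.025 0.925 0.025]]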
Example #2
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        label_smooth_eps, ):
    enc_inputs = make_all_inputs(encoder_data_input_fields +
                                 encoder_util_input_fields)

    enc_output = wrap_encoder(
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        enc_inputs, )
    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1] +
                                 decoder_util_input_fields)

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        dec_inputs,
        enc_output, )

    # Padding positions do not contribute to the total loss; the weights are
    # used to mask them out when computing the loss.
    label, weights = make_all_inputs(label_data_input_fields)
    if label_smooth_eps:
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=trg_vocab_size),
            epsilon=label_smooth_eps)
    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num
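The weighting trick is identical across these variants: `weights` holds 1 for real tokens and 0 for padding, so multiplying the per-token cost by it and dividing by `reduce_sum(weights)` gives a per-token average that ignores padding. A minimal NumPy sketch with made-up numbers:

import numpy as np

cost = np.array([[2.0], [1.0], [3.0], [0.5]])     # per-token cross-entropy
weights = np.array([[1.0], [1.0], [1.0], [0.0]])  # last position is padding

sum_cost = np.sum(cost * weights)   # 6.0; the padded position contributes nothing
token_num = np.sum(weights)         # 3 real tokens
avg_cost = sum_cost / token_num     # 2.0 average loss per real token
print(sum_cost, token_num, avg_cost)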
Example #3
    def _collect_metrics(self, inputs, outputs):
        """ Calculate loss function by using inputs and outputs. """
        metrics = {}

        tgt_len = layers.reduce_sum(
            layers.reduce_sum(inputs["tgt_mask"], dim=1) - 1)
        tgt_len.stop_gradient = True

        label = inputs["tgt_token"][:, 1:]
        if self.label_smooth > 0:
            one_hot_label = layers.one_hot(label, self.num_token_embeddings)
            smooth_label = layers.label_smooth(one_hot_label,
                                               epsilon=self.label_smooth,
                                               dtype=self._dtype)
            nll = layers.cross_entropy(outputs["dec_pred"],
                                       smooth_label,
                                       soft_label=True,
                                       ignore_index=self.padding_idx)
        else:
            nll = layers.cross_entropy(outputs["dec_probs"],
                                       label,
                                       ignore_index=self.padding_idx)
        nll = layers.reduce_sum(nll, dim=1)
        token_nll = layers.reduce_sum(nll) / tgt_len
        nll = layers.reduce_mean(nll)
        metrics["nll"] = nll
        metrics["token_nll"] = token_nll
        loss = nll

        if self.num_latent > 0 and self.with_bow:
            bow_probs = F.unsqueeze(outputs["bow_probs"], [1])
            bow_probs = layers.expand(bow_probs, [1, label.shape[1], 1])
            if self.label_smooth > 0:
                bow = layers.cross_entropy(bow_probs,
                                           smooth_label,
                                           soft_label=True,
                                           ignore_index=self.padding_idx)
            else:
                bow = layers.cross_entropy(bow_probs,
                                           label,
                                           ignore_index=self.padding_idx)
            bow = layers.reduce_sum(bow, dim=1)
            token_bow = layers.reduce_sum(bow) / tgt_len
            bow = layers.reduce_mean(bow)
            metrics["bow"] = bow
            metrics["token_bow"] = token_bow
            loss = loss + bow

        if self.num_latent > 0 and self.use_discriminator:
            dis = 0.0 - (layers.log(outputs["pos_probs"]) +
                         layers.log(1.0 - outputs["neg_probs"]))
            dis = layers.reduce_mean(dis)
            metrics["dis"] = dis
            loss = loss + dis * self.dis_ratio

        metrics["loss"] = loss
        metrics["token_num"] = tgt_len
        return metrics
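This example keeps two views of the same loss: `token_nll` normalizes by the number of real target tokens, while `nll` averages over sequences. A NumPy sketch with hypothetical per-token values:

import numpy as np

# Per-token NLL for a batch of 2 sequences (toy values); the second
# sequence has one padded position with zero loss.
nll = np.array([[1.0, 2.0, 3.0],
                [4.0, 5.0, 0.0]])
tgt_len = 5.0                        # total number of real target tokens

per_seq = nll.sum(axis=1)            # [6.0, 9.0]
token_nll = per_seq.sum() / tgt_len  # 3.0 -- average per token
seq_nll = per_seq.mean()             # 7.5 -- average per sequence
print(token_nll, seq_nll)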
Example #4
 def test_label_smooth(self):
     program = Program()
     with program_guard(program):
         label = layers.data(name="label", shape=[1], dtype="float32")
         one_hot_label = layers.one_hot(input=label, depth=10)
         smooth_label = layers.label_smooth(
             label=one_hot_label, epsilon=0.1, dtype="float32")
         self.assertIsNotNone(smooth_label)
     print(str(program))
Example #6
    def __call__(self, predict, label, weights):
        if self.label_smooth_eps:
            label_out = layers.label_smooth(
                label=layers.one_hot(input=label, depth=predict.shape[-1]),
                epsilon=self.label_smooth_eps)
        else:
            # Without smoothing, feed the hard label ids directly.
            label_out = label

        cost = layers.softmax_with_cross_entropy(
            logits=predict,
            label=label_out,
            soft_label=True if self.label_smooth_eps else False)
        weighted_cost = cost * weights
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(weights)
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num
        return sum_cost, avg_cost, token_num
Example #7
    def forward(self, outputs, labels):
        predict, (label, weights) = outputs[0], labels
        if self.label_smooth_eps:
            label = layers.label_smooth(
                label=layers.one_hot(input=label, depth=predict.shape[-1]),
                epsilon=self.label_smooth_eps)

        cost = layers.softmax_with_cross_entropy(
            logits=predict,
            label=label,
            soft_label=True if self.label_smooth_eps else False)
        weighted_cost = cost * weights
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(weights)
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num
        return avg_cost
Example #8
    def build_model(self, enc_input, dec_input, tgt_label, label_weights):
        """Build the model with source encoding and target decoding"""

        enc_word_output, enc_sen_output = self.encode(enc_input)
        dec_output = self.decode(dec_input, enc_word_output, enc_sen_output)

        predict_token_idx = layers.argmax(dec_output, axis=-1)
        correct_token_idx = layers.cast(layers.equal(
            tgt_label, layers.reshape(predict_token_idx, shape=[-1, 1])),
                                        dtype='float32')
        weighted_correct = layers.elementwise_mul(x=correct_token_idx,
                                                  y=label_weights,
                                                  axis=0)
        sum_correct = layers.reduce_sum(weighted_correct)
        sum_correct.stop_gradient = True

        # Padding positions do not contribute to the total loss; the weights
        # are used to mask them out when computing the loss.
        if self._label_smooth_eps:
            # TODO: switch to fluid.input.one_hot once softmax_with_cross_entropy
            # no longer requires the last dimension of the label to be 1.
            tgt_label = layers.label_smooth(
                label=layers.one_hot(input=tgt_label, depth=self.voc_size),
                epsilon=self._label_smooth_eps)

        cost = layers.softmax_with_cross_entropy(
            logits=dec_output,
            label=tgt_label,
            soft_label=True if self._label_smooth_eps else False)

        weighted_cost = layers.elementwise_mul(x=cost, y=label_weights, axis=0)
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(label_weights)
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num

        graph_vars = {
            "loss": avg_cost,
            "sum_correct": sum_correct,
            "token_num": token_num,
        }
        for k, v in graph_vars.items():
            v.persistable = True

        return graph_vars
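The `sum_correct` bookkeeping above is just masked token accuracy: take the argmax over the vocabulary, compare with the gold ids, and zero out padding positions. A NumPy sketch with toy values:

import numpy as np

# Hypothetical decoder scores for 4 positions over a 3-word vocabulary.
dec_output = np.array([[0.1, 0.7, 0.2],
                       [0.8, 0.1, 0.1],
                       [0.2, 0.3, 0.5],
                       [0.9, 0.05, 0.05]])
tgt_label = np.array([[1], [0], [1], [0]])
label_weights = np.array([[1.0], [1.0], [1.0], [0.0]])  # last is padding

predict = np.argmax(dec_output, axis=-1).reshape(-1, 1)
correct = (predict == tgt_label).astype(np.float32)
sum_correct = np.sum(correct * label_weights)  # 2.0: two real tokens correct
print(sum_correct)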
Example #9
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            # 'empty' mask: no position attends to anything.
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(1,
                                          len(tokenizer.vocab),
                                          size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # target positions continue from the source
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  # corrupted decoder input
        else:
            tgt_labels = tgt_ids

        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***  src,  tgt, attn
        src  00,   01,   11
        tgt  10,   11,   12
        attn 20,   21,   22

        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''

        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)

        '''

        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn('src',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
        propeller.data.TextColumn('tgt',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \
                                   .map(map_fn)

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.eval_bsz) \
                                   .map(after_padding)

    log.debug('shard %d of %d' %
              (D.parallel.Env().dev_id, D.parallel.Env().nranks))
    train_ds = train_ds.shard(
        D.parallel.Env().nranks,
        D.parallel.Env().dev_id).shuffle(10000).padded_batch(
            args.bsz).map(after_padding)
    dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id)

    shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
    types = ['int64'] * 11

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    vocab_size, _ = model.word_emb.weight.shape
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(learning_rate=LinearDecay(
        args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps),
                parameter_list=model.parameters(),
                weight_decay=args.wd,
                grad_clip=g_clip)
    attn_id = tokenizer.vocab[args.attn_token]
    for step, data in enumerate(train_ds.start(place)):
        (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
         attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
         tgt_labels) = data

        _, __, info = model(src_ids,
                            sent_ids=src_sids,
                            pos_ids=src_pids,
                            attn_bias=mask_src_2_src,
                            encode_only=True)
        cached_k, cached_v = info['caches']
        _, __, info = model(tgt_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_tgt_2_srctgt,
                            past_cache=(cached_k, cached_v),
                            encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [
            L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
        ]
        past_cache_v = [
            L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
        ]
        if args.label_smooth > 0.:
            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                        epsilon=args.label_smooth)
        loss, _, __ = model(attn_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_attn_2_srctgtattn,
                            past_cache=(past_cache_k, past_cache_v),
                            tgt_labels=tgt_labels,
                            tgt_pos=L.where(attn_ids == attn_id))

        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()
        if step % 10 == 0:
            loss = loss.numpy()
            ppl = np.exp(loss)
            log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' %
                      (step, loss, ppl, opt.current_step_lr()))
        if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env().dev_id == 0:
            F.save_dygraph(model.state_dict(), args.save_dir)
        if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
            assert os.path.exists(
                args.predict_output_dir
            ), 'predict_output_dir not found: %s' % args.predict_output_dir
            log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id)
            evaluate(model, dev_ds, step, args)
        if step > args.max_steps:
            break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
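The mask types `gen_mask` produces are easiest to see on a toy batch. A NumPy sketch mirroring its 'bidi', 'causal', 'causal_without_diag', and 'diag' branches (the same operations, not the function above):

import numpy as np

ids = np.array([[7, 8, 9, 0]])  # one sequence of length 4; last slot is padding
bidi = np.tile(((ids != 0).astype(np.float32))[:, None, :], [1, 4, 1])

causal = np.tril(bidi)                                    # attend to self and past
causal_wo_diag = np.tril(bidi, -1)                        # past only
diag = np.stack([np.diag(np.diag(m)) for m in bidi], 0)   # self only
print(causal[0])   # lower-triangular rows, with the padded column zeroed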
Example #10
    def finetune(
            self,
            train_path,
            dev_path=None,
            save_dir="ernie_gen_result",
            init_ckpt_path=None,
            use_gpu=True,
            max_steps=500,
            batch_size=8,
            max_encode_len=50,
            max_decode_len=50,
            learning_rate=5e-5,
            warmup_proportion=0.1,
            weight_decay=0.1,
            noise_prob=0,
            label_smooth=0,
            beam_width=5,
            length_penalty=1.0,
            log_interval=100,
            save_interval=200,
    ):
        """
        Finetune the model on the specified dataset.

        Args:
            train_path(str): the train dataset path.
            dev_path(str): the dev dataset path.
            save_dir(str): the model params and dev dataset predict result save path.
            init_ckpt_path(str): incremental training load path.
            use_gpu(bool): use gpu or not.
            max_steps(int): max training steps.
            batch_size(int): the batch size.
            max_encode_len(int): the max encode length.
            max_decode_len(int): the max decode length.
            learning_rate(float): the learning rate.
            warmup_proportion(float): the warmup proportion.
            weight_decay(float): the weight decay magnitude.
            noise_prob(float): the noise probability. See the ERNIE-GEN paper for details.
            label_smooth(float): the label smooth magnitude.
            beam_width(int): the beam size during evaluating the dev dataset.
            length_penalty(float): the length penalty during evaluating the dev dataset.
            log_interval(int): the log interval.
            save_interval(int): the save interval. dev set will be evaluated after saving.

        Returns:
            result(dict): a dictionary of the form::
                {
                    last_save_path(str): last model save path.
                    last_ppl(float): last model ppl.
                }
        """
        self.max_encode_len = max_encode_len
        self.max_decode_len = max_decode_len
        self.noise_prob = noise_prob

        place = F.CUDAPlace(0) if use_gpu else F.CPUPlace()

        with F.dygraph.guard(place):
            if init_ckpt_path is not None:
                logger.info('loading checkpoint from %s' % init_ckpt_path)
                sd, _ = D.load_dygraph(init_ckpt_path)
                self.model.set_dict(sd)

            feature_column = propeller.data.FeatureColumns([
                propeller.data.LabelColumn('id'),
                propeller.data.TextColumn(
                    'src',
                    unk_id=self.tokenizer.unk_id,
                    vocab_dict=self.tokenizer.vocab,
                    tokenizer=self.tokenizer.tokenize),
                propeller.data.TextColumn(
                    'tgt',
                    unk_id=self.tokenizer.unk_id,
                    vocab_dict=self.tokenizer.vocab,
                    tokenizer=self.tokenizer.tokenize),
            ])

            train_ds = feature_column.build_dataset('train', data_file=train_path, shuffle=False,
                                                    repeat=True, use_gz=False)\
                .map(self._map_fn).shuffle(10000).padded_batch(batch_size).map(self._after_padding)
            train_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
            train_ds.data_types = ['int64'] * 11

            if dev_path:
                dev_ds = feature_column.build_dataset('dev', data_file=dev_path, shuffle=False,
                                                    repeat=False, use_gz=False) \
                    .map(self._map_fn) \
                    .padded_batch(1) \
                    .map(self._after_padding)
                dev_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
                dev_ds.data_types = ['int64'] * 11

            vocab_size, _ = self.model.word_emb.weight.shape
            g_clip = F.clip.GradientClipByGlobalNorm(1.0)
            opt = AdamW(
                learning_rate=LinearDecay(learning_rate,
                                          int(warmup_proportion * max_steps),
                                          max_steps),
                parameter_list=self.model.parameters(),
                weight_decay=weight_decay,
                grad_clip=g_clip)

            loss = None

            save_path = None
            ppl = None

            if save_dir and not os.path.exists(save_dir):
                os.makedirs(save_dir)
            for step, data in enumerate(train_ds.start(place)):
                (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                 tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                 mask_attn_2_srctgtattn, tgt_labels) = data

                _, __, info = self.model(
                    src_ids,
                    sent_ids=src_sids,
                    pos_ids=src_pids,
                    attn_bias=mask_src_2_src,
                    encode_only=True)
                cached_k, cached_v = info['caches']
                _, __, info = self.model(
                    tgt_ids,
                    sent_ids=tgt_sids,
                    pos_ids=tgt_pids,
                    attn_bias=mask_tgt_2_srctgt,
                    past_cache=(cached_k, cached_v),
                    encode_only=True)
                cached_k2, cached_v2 = info['caches']
                past_cache_k = [
                    L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
                ]
                past_cache_v = [
                    L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
                ]
                if label_smooth > 0.:
                    tgt_labels = L.label_smooth(
                        F.one_hot(tgt_labels, vocab_size), epsilon=label_smooth)
                loss, _, __ = self.model(
                    attn_ids,
                    sent_ids=tgt_sids,
                    pos_ids=tgt_pids,
                    attn_bias=mask_attn_2_srctgtattn,
                    past_cache=(past_cache_k, past_cache_v),
                    tgt_labels=tgt_labels,
                    tgt_pos=L.where(attn_ids == self.tokenizer.vocab['[MASK]']))

                loss.backward()
                opt.minimize(loss)
                self.model.clear_gradients()

                if step % log_interval == 0:
                    loss_np = loss.numpy()
                    ppl = np.exp(loss_np)
                    logger.info(
                        '[step %d / %d]train loss %.5f, ppl %.5f, elr %.3e' %
                        (step, max_steps, loss_np, ppl, opt.current_step_lr()))
                if save_dir and step % save_interval == 0 and step > 0:
                    loss_np = loss.numpy()
                    ppl = np.exp(loss_np)
                    save_name = "step_%s_ppl_%.5f" % (step, ppl)
                    save_path = os.path.join(save_dir, save_name)
                    logger.info("save the model in %s" % save_path)
                    F.save_dygraph(self.model.state_dict(), save_path)

                    if dev_path:
                        logger.info('evaluating...')
                        res = self._evaluate(dev_ds, place, beam_width,
                                             length_penalty)
                        output_path = os.path.join(
                            save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                        logger.info(
                            'save the predict result in %s' % output_path)
                        with open(output_path, 'w') as fout:
                            fout.write(('\n'.join(res)))

                if step > max_steps:
                    break

            if loss:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                logger.info('[final step %d]train loss %.5f, ppl %.5f, elr %.3e'
                            % (step, loss_np, ppl, opt.current_step_lr()))
                if save_dir:
                    save_name = "step_%s_ppl_%.5f" % (step, ppl)
                    save_path = os.path.join(save_dir, save_name)
                    logger.info("save the model in %s" % save_path)
                    F.save_dygraph(self.model.state_dict(), save_path)

                    if dev_path:
                        logger.info('evaluating...')
                        res = self._evaluate(dev_ds, place, beam_width,
                                             length_penalty)
                        output_path = os.path.join(
                            save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                        logger.info(
                            'save the predict result in %s' % output_path)
                        with open(output_path, 'w') as fout:
                            fout.write(('\n'.join(res)))

            result = {
                "last_save_path": "%s.pdparams" % save_path,
                "last_ppl": ppl[0],
            }

            return result
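A hypothetical usage sketch; `module` stands for whatever object exposes the `finetune` method above (e.g. a loaded PaddleHub ERNIE-GEN module), and the file paths are placeholders:

# Assumed setup: `module` is an instance of the class defining finetune() above.
result = module.finetune(
    train_path='train.txt',       # placeholder path
    dev_path='dev.txt',
    save_dir='ernie_gen_result',
    max_steps=500,
    batch_size=8,
    label_smooth=0.1)             # enables the label-smoothing branch above
print(result['last_save_path'], result['last_ppl'])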
Example #11
def transformer(src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                use_py_reader=False,
                is_test=False):
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    data_input_names = encoder_data_input_fields + \
                decoder_data_input_fields[:-1] + label_data_input_fields

    if use_py_reader:
        all_inputs, reader = make_all_py_reader_inputs(data_input_names,
                                                       is_test)
    else:
        all_inputs = make_all_inputs(data_input_names)
    # print("all inputs",all_inputs)
    enc_inputs_len = len(encoder_data_input_fields)
    dec_inputs_len = len(decoder_data_input_fields[:-1])
    enc_inputs = all_inputs[0:enc_inputs_len]
    dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]
    label = all_inputs[-2]
    weights = all_inputs[-1]

    enc_output = wrap_encoder(src_vocab_size, max_length, n_layer, n_head,
                              d_key, d_value, d_model, d_inner_hid,
                              prepostprocess_dropout, attention_dropout,
                              relu_dropout, preprocess_cmd, postprocess_cmd,
                              weight_sharing, enc_inputs)

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        dec_inputs,
        enc_output,
    )

    # Padding positions do not contribute to the total loss; the weights are
    # used to mask them out when computing the loss.
    if label_smooth_eps:
        label = layers.label_smooth(label=layers.one_hot(input=label,
                                                         depth=trg_vocab_size),
                                    epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num, (reader if use_py_reader else None)
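`avg_cost` here is the per-token cross-entropy in nats, which is why the training loops in Examples #9 and #10 report perplexity as its exponential. A one-line illustration with a made-up loss value:

import numpy as np

avg_cost = 2.0               # hypothetical per-token loss in nats
ppl = np.exp(avg_cost)       # perplexity, approximately 7.389
print(ppl)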
Example #12
def transformer(model_input,
                src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                is_test=False):
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    enc_inputs = (model_input.src_word, model_input.src_pos,
                  model_input.src_slf_attn_bias)
    dec_inputs = (model_input.trg_word, model_input.trg_pos,
                  model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias)
    label = model_input.lbl_word
    weights = model_input.lbl_weight

    enc_output = wrap_encoder(enc_inputs,
                              src_vocab_size,
                              max_length,
                              n_layer,
                              n_head,
                              d_key,
                              d_value,
                              d_model,
                              d_inner_hid,
                              prepostprocess_dropout,
                              attention_dropout,
                              relu_dropout,
                              preprocess_cmd,
                              postprocess_cmd,
                              weight_sharing,
                              bos_idx=bos_idx)

    predict = wrap_decoder(dec_inputs,
                           trg_vocab_size,
                           max_length,
                           n_layer,
                           n_head,
                           d_key,
                           d_value,
                           d_model,
                           d_inner_hid,
                           prepostprocess_dropout,
                           attention_dropout,
                           relu_dropout,
                           preprocess_cmd,
                           postprocess_cmd,
                           weight_sharing,
                           enc_output=enc_output)

    # Padding positions do not contribute to the total loss; the weights are
    # used to mask them out when computing the loss.
    if label_smooth_eps:
        # TODO: switch to fluid.input.one_hot once softmax_with_cross_entropy
        # no longer requires the last dimension of the label to be 1.
        label = layers.label_smooth(label=layers.one_hot(input=label,
                                                         depth=trg_vocab_size),
                                    epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = layers.elementwise_mul(x=cost, y=weights, axis=0)
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num
Example #13
def transformer(src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                is_test=False,
                model_input=None):
    """
        transformer main
    """
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )
    enc_inputs = (model_input.src_word, model_input.src_pos,
                  model_input.src_slf_attn_bias)
    dec_inputs = (model_input.trg_word, model_input.trg_pos,
                  model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias)
    label = model_input.lbl_word
    weights = model_input.lbl_weight

    enc_output = wrap_encoder(src_vocab_size,
                              max_length,
                              n_layer,
                              n_head,
                              d_key,
                              d_value,
                              d_model,
                              d_inner_hid,
                              prepostprocess_dropout,
                              attention_dropout,
                              relu_dropout,
                              preprocess_cmd,
                              postprocess_cmd,
                              weight_sharing,
                              enc_inputs,
                              bos_idx=bos_idx)

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        dec_inputs,
        enc_output,
    )

    # Padding positions do not contribute to the total loss; the weights are
    # used to mask them out when computing the loss.
    if label_smooth_eps:
        label = layers.label_smooth(label=layers.one_hot(input=label,
                                                         depth=trg_vocab_size),
                                    epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return [sum_cost, avg_cost, predict, token_num]
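Putting the pieces together, the loss that every example on this page computes reduces to a few lines of NumPy (toy logits and labels, with the softmax and smoothing written out explicitly):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(-1, keepdims=True))
    return e / e.sum(-1, keepdims=True)

logits = np.array([[2.0, 0.5, 0.1],      # toy scores over a 3-word vocabulary
                   [0.2, 1.5, 0.3]])
labels = np.array([0, 1])                # gold token ids
weights = np.array([1.0, 1.0])           # no padding in this toy batch
eps, k = 0.1, 3

soft = (1 - eps) * np.eye(k)[labels] + eps / k      # label_smooth
cost = -(soft * np.log(softmax(logits))).sum(-1)    # soft cross-entropy
avg_cost = (cost * weights).sum() / weights.sum()   # padding-masked mean
print(avg_cost)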