Example #1
def predict(model, data, tokenizer, label_map, batch_size=1):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data, each element of which is an Example (namedtuple) object.
            An Example object contains `text`(word_ids) and `seq_len`(sequence length).
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
            which contains most of the methods. Users should refer to the superclass for more information regarding methods.
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The number of samples per batch.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            label_list=label_map.values(),
            max_seq_length=args.max_seq_length,
            is_test=True)
        examples.append((input_ids, segment_ids))

    # Separate the data into batches.
    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    results = []
    model.eval()
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
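
The `batchify_fn` idiom above recurs throughout these examples: `Tuple` applies one collate function per field of each sample, `Pad` right-pads variable-length sequences to the longest in the batch, and `Stack` stacks same-shaped arrays. A minimal, runnable sketch of what the collation produces (toy ids, not from the original):

from paddlenlp.data import Pad, Tuple

samples = [
    ([1, 2, 3, 4], [0, 0, 0, 0]),  # (input_ids, segment_ids)
    ([5, 6], [0, 0]),
    ([7, 8, 9], [0, 0, 0]),
]
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0),  # input_ids, padded to the longest sample
    Pad(axis=0, pad_val=0),  # segment_ids, padded the same way
): fn(samples)
input_ids, segment_ids = batchify_fn(samples)
print(input_ids)
# [[1 2 3 4]
#  [5 6 0 0]
#  [7 8 9 0]]
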
Example #2
    def build_dataset(index, name, num_samples):
        dataset = GPTDataset(file_prefix=input_prefix,
                             build_data_file=local_rank == 0,
                             micro_batch_size=args.micro_batch_size,
                             name="gpt_" + name,
                             max_seq_len=max_seq_len,
                             num_samples=num_samples,
                             documents=np.arange(splits[index],
                                                 splits[index + 1]),
                             sample_ids=sample_ids,
                             sample_lens=sample_lens,
                             eos_id=eos_id,
                             seed=args.seed,
                             use_pure_fp16=args.use_amp
                             and args.amp_level == "O2",
                             data_holders=data_holders)
        batch_sampler = DistributedBatchSampler(
            dataset,
            batch_size=args.micro_batch_size,
            num_replicas=data_world_size,
            rank=data_world_rank,
            shuffle=False,
            drop_last=True)

        if pipeline_mode:

            def data_gen():
                for data in dataset:
                    yield tuple(
                        [np.expand_dims(np.array(x), axis=0) for x in data])

            data_loader = paddle.fluid.io.DataLoader.from_generator(
                feed_list=data_holders, capacity=70, iterable=False)
            data_loader.set_sample_generator(data_gen,
                                             batch_size=args.micro_batch_size,
                                             places=places)
        else:
            stacks = (Stack(), ) * len(data_holders)
            collate_fn = Tuple(*stacks)
            data_loader = DataLoader(dataset=dataset,
                                     places=places,
                                     feed_list=data_holders,
                                     batch_sampler=batch_sampler,
                                     num_workers=1,
                                     worker_init_fn=worker_init,
                                     collate_fn=collate_fn,
                                     return_list=False)
        return data_loader
Example #3
    def predict(self, data, tokenizer):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(str)`): The batch data, each element of which is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.

        Returns:
            probs(obj:`numpy.ndarray`): The predicted probabilities of each text over the classes.
        """
        if args.benchmark:
            self.autolog.times.start()

        examples = []
        for text in data:
            input_ids, segment_ids = convert_example(text,
                                                     tokenizer,
                                                     is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
                ),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
                ),  # segment
        ): fn(samples)

        if args.benchmark:
            self.autolog.times.stamp()

        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        if args.benchmark:
            self.autolog.times.stamp()

        probs = softmax(logits, axis=1)
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()

        if args.benchmark:
            self.autolog.times.end(stamp=True)

        return probs
Example #4
def create_data_loader(args):
    # Create dataset.
    train_ds, test_ds = load_dataset(
        datafiles=(os.path.join(args.data_dir, 'train.tsv'),
                   os.path.join(args.data_dir, 'test.tsv')))

    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic maps full-width characters to their half-width equivalents
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(convert_example,
                         max_seq_len=args.max_seq_len,
                         word_vocab=word_vocab,
                         label_vocab=label_vocab,
                         normlize_vocab=normlize_vocab)
    train_ds.map(trans_func)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=word_vocab.get("[PAD]", 0), dtype='int64'
            ),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64'
            ),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    return word_vocab, label_vocab, train_loader, test_loader
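
A companion sketch for the sequence-labeling collate above: `Stack` turns the scalar lengths into one array while the two `Pad` fields are padded independently (hypothetical ids; 0 stands in for the `[PAD]` and `O` indices):

from paddlenlp.data import Pad, Stack, Tuple

samples = [
    ([11, 12, 13], 3, [1, 2, 1]),  # (word_ids, length, label_ids)
    ([14, 15], 2, [1, 1]),
]
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
    Stack(dtype='int64'),  # length
    Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
): fn(samples)
word_ids, lengths, label_ids = batchify_fn(samples)
# word_ids -> [[11 12 13] [14 15  0]], lengths -> [3 2]
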
Example #5
    def predict(self, data, tokenizer, batch_size=1, threshold=0.5):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(Example)`): The processed data, each element of which is an Example (namedtuple) object.
                An Example object contains `text`(word_ids) and `seq_len`(sequence length).
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
            batch_size(obj:`int`, defaults to 1): The number of samples per batch.
            threshold(obj:`float`, defaults to 0.5): The threshold for converting probabilities to labels.

        Returns:
            results(obj:`list`): The multi-hot label predictions, one array per input text.
        """
        examples = []
        for text in data:
            example = {"text": text}
            input_ids, segment_ids = convert_example(
                example,
                tokenizer,
                max_seq_length=self.max_seq_length,
                is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        # Separate the data into batches.
        batches = [
            examples[idx:idx + batch_size]
            for idx in range(0, len(examples), batch_size)
        ]

        results = []
        for batch in batches:
            input_ids, segment_ids = batchify_fn(batch)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = paddle.to_tensor(self.output_handle.copy_to_cpu())
            probs = F.sigmoid(logits)
            preds = (probs.numpy() > threshold).astype(int)
            results.extend(preds)
        return results
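
Unlike the softmax/argmax predictors elsewhere in this list, this one is multi-label: every class gets an independent sigmoid probability, and `threshold` turns the probabilities into a multi-hot vector. A numpy-only illustration with made-up logits:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

logits = np.array([[2.0, -1.0, 0.3],
                   [-3.0, 0.9, 0.1]])
preds = (sigmoid(logits) > 0.5).astype(int)
print(preds)
# [[1 0 1]
#  [0 1 1]]
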
Example #6
def predict_cls(args, ext_results):
    # load dict
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id), Stack(dtype="int64")
    ): fn(samples)

    # Keep shuffle=False so predictions stay aligned with the input order.
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # load cls model
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()

    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)

        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results
Example #7
    def predict(self, data, tokenizer):
        """
        Computes embeddings for the corpus texts.

        Args:
            data (obj:`List(str)`): The batch data, each element of which is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.

        Returns:
            None. The corpus embeddings are computed batch-by-batch and saved to `corpus_embedding.npy`.
        """

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        all_embeddings = []
        examples = []
        for idx, text in enumerate(tqdm(data)):
            input_ids, segment_ids = convert_example(
                text,
                tokenizer,
                max_seq_length=self.max_seq_length,
                pad_to_max_seq_len=True)
            examples.append((input_ids, segment_ids))
            if len(examples) >= 100:
                input_ids, segment_ids = batchify_fn(examples)
                self.input_handles[0].copy_from_cpu(input_ids)
                self.input_handles[1].copy_from_cpu(segment_ids)
                self.predictor.run()
                logits = self.output_handle.copy_to_cpu()
                all_embeddings.append(logits)
                examples = []

        if len(examples) > 0:
            input_ids, segment_ids = batchify_fn(examples)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            all_embeddings.append(logits)

        all_embeddings = np.concatenate(all_embeddings, axis=0)
        np.save('corpus_embedding', all_embeddings)
Example #8
def create_pretrained_dataset(args, input_path, worker_init, worker_index,
                              eod_id):
    train_data = GPT2Dataset(file_path=input_path,
                             worker_index=worker_index,
                             num_samples=args.batch_size * args.max_steps,
                             eod_id=eod_id,
                             seed=args.seed + worker_index)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_data, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_data,
                                   batch_sampler=train_batch_sampler,
                                   num_workers=0,
                                   worker_init_fn=worker_init,
                                   collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                    Stack(), Stack()))
    return train_data_loader
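
Because `GPT2Dataset` yields fixed-length samples here, no padding is needed and the collate is simply one `Stack` per field. `Stack` is essentially `np.stack` over the batch, as this toy sketch shows:

import numpy as np
from paddlenlp.data import Stack, Tuple

samples = [(np.arange(4), np.ones(4)), (np.arange(4) + 4, np.zeros(4))]
tokens, masks = Tuple(Stack(), Stack())(samples)
print(tokens.shape)  # (2, 4)
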
Example #9
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        task_name,
                                        splits=["train", "dev"])
    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']

    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq len
        Stack(dtype="int64")  # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
Example #10
def build_data_loader(args, tokenizer):
    """ build corpus_data_loader and text_data_loader
    """

    id2corpus = gen_id2corpus(args.corpus_file)

    # convert_example's input must be a dict
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
    ): [data for data in fn(samples)]

    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # build text data_loader
    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)

    text_ds = MapDataset(text_list)

    text_data_loader = create_dataloader(text_ds,
                                         mode='predict',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    d = {
        "text_data_loader": text_data_loader,
        "corpus_data_loader": corpus_data_loader,
        "id2corpus": id2corpus,
        "text2similar_text": text2similar_text,
        "text_list": text_list
    }

    return d
Example #11
def init_lstm_var(args):
    # Different languages use different tokenizers.
    if args.language == "ch":
        tokenizer = ErnieTokenizer.from_pretrained(args.vocab_path)
        padding_idx = tokenizer.vocab.get('[PAD]')
        tokenizer.inverse_vocab = [
            item[0]
            for item in sorted(tokenizer.vocab.items(), key=lambda x: x[1])
        ]
    else:
        vocab = Vocab.load_vocabulary(args.vocab_path,
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
        tokenizer = CharTokenizer(vocab)
        padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init the attention layer.
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = SentiData().read(os.path.join(args.data_dir, 'dev'),
                              args.language)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    ): [data for data in fn(samples)]

    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
Example #12
def do_predict(data,
               model,
               tokenizer,
               viterbi_decoder,
               tags_to_idx,
               idx_to_tags,
               batch_size=1,
               summary_num=2):

    examples = []
    for text in data:
        example = {"tokens": list(text)}
        input_ids, token_type_ids, seq_len = convert_example(example,
                                                             tokenizer,
                                                             args.max_seq_len,
                                                             is_test=True)

        examples.append((input_ids, token_type_ids, seq_len))

    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    ): fn(samples)

    all_pred_tags = []

    model.eval()
    for batch in batches:
        input_ids, token_type_ids, seq_len = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        seq_len = paddle.to_tensor(seq_len)
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())
    results = decode(data, all_pred_tags, summary_num, idx_to_tags)
    return results
Example #13
def predict(model, data, label_map, batch_size=1, pad_token_id=0):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data, each element of which is an Example (namedtuple) object.
            An Example object contains `text`(word_ids) and `seq_len`(sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The number of samples per batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        results(obj:`list`): All the prediction labels.
    """

    # Separate the data into batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    results = []
    model.eval()
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)
        logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
Example #14
def create_eval_dataset(args):
    val_dataloader = None
    eval_batch_size = args.batch_size
    seq_len = args.seq_length

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    pad_token = tokenizer.command_name_map["pad"].Id

    if not args.cloze_eval:
        with open(args.eval_path, "rb") as reader:
            entire_data = reader.read().decode('utf-8')
        num_original_tokens = len(entire_data.strip().split(" "))
        entire_data = wikitext_detokenizer(entire_data)
        tokenized_data = tokenizer.encode(entire_data)
        num_tokenized_tokens = len(tokenized_data)
        print('Original Tokens: %d, Detokenized tokens: %d' %
              (num_original_tokens, num_tokenized_tokens))
        val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, pad_token,
                                      args.overlapping_eval)
    else:
        tokenized_data = []
        tokenized_label = []
        with open(args.eval_path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = get_tokens(tokenizer, text)
                tokenized_data.append(tokens)
                tokenized_label.append(labels)
        val_dataset = Lambada_Eval_Dataset(tokenized_data, tokenized_label,
                                           seq_len, pad_token)
        num_tokenized_tokens = 0
        num_original_tokens = 0

    args.num_examples = len(val_dataset)
    args.num_original_tokens = num_original_tokens
    args.num_tokenized_tokens = num_tokenized_tokens
    val_dataloader = DataLoader(val_dataset,
                                batch_size=eval_batch_size,
                                drop_last=False,
                                collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                 Stack(), Stack()))

    return val_dataloader
Example #15
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
    print(dev_ds[0])

    pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
    tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')

    trans_func_eval = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        phase="eval")

    batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn_eval,
        trans_fn=trans_func_eval)

    model = PairwiseMatching(pretrained_model, margin=args.margin)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    metric = paddle.metric.Auc()
    evaluate(model, metric, dev_data_loader, "dev")
Example #16
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    dataset_class = TASK_CLASSES[task_name]
    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])

    if task_name == 'senta':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']

    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq len
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    train_ds = train_ds.apply(trans_fn, lazy=True)
    dev_ds = dev_ds.apply(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
Example #17
    def preprocess(self, input_dicts, data_id, log_id):
        from paddlenlp.data import Stack, Tuple, Pad

        (_, input_dict), = input_dicts.items()
        print("input dict", input_dict)
        batch_size = len(input_dict.keys())
        examples = []
        for i in range(batch_size):
            input_ids, segment_ids = convert_example([input_dict[str(i)]],
                                                     self.tokenizer)
            examples.append((input_ids, segment_ids))
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # segment
        ): fn(samples)
        input_ids, segment_ids = batchify_fn(examples)
        feed_dict = {}
        feed_dict['input_ids'] = input_ids
        feed_dict['token_type_ids'] = segment_ids
        return feed_dict, False, None, ""
Example #18
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size, test_dataset.vocab_size,
                       test_dataset.num_labels)
    # int64 here: token ids and lengths can exceed the int16 range.
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    chunk_evaluator = ChunkEvaluator(
        label_list=test_dataset.label_vocab.keys(), suffix=True)
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
Example #19
def get_mnli_dev_dataloader(tokenizer, args, matched=True):
    if matched:
        split = "dev_matched"
    else:
        split = "dev_mismatched"
    filename = os.path.join("caches", args.task_name + f"_{split}" + ".pkl")
    if os.path.exists(filename):
        ds = load_pickle(filename)
    else:
        ds = load_dataset("glue", args.task_name, splits=split)
        ds.map(
            partial(trans_func, tokenizer=tokenizer, args=args),
            batched=False,
            lazy=False,
        )
        save_pickle(ds, filename)

    batch_sampler = BatchSampler(ds,
                                 batch_size=args.train_batch_size,
                                 shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # attention_mask
        Pad(axis=0, pad_val=-100, dtype="int64"),  # lm_labels
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # decoder_attention_mask
    ): fn(samples)

    data_loader = DataLoader(
        dataset=ds,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        num_workers=args.num_workers,
        return_list=True,
    )

    return data_loader
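
The caching branch relies on `load_pickle`/`save_pickle` helpers that are not shown in this snippet; a plausible minimal implementation (an assumption, not the original code) would be:

import pickle

def save_pickle(obj, path):
    # Assumed helper: serialize the mapped dataset to disk.
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_pickle(path):
    # Assumed helper: restore a previously cached dataset.
    with open(path, "rb") as f:
        return pickle.load(f)
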
Example #20
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only support QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    dataset_class = TASK_CLASSES[task_name]

    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None,
    )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.apply(trans_func, lazy=True)
    dev_ds = dev_ds.apply(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Stack(dtype="int64" if train_ds.get_labels() else "float32")  # label
    ): [data for data in fn(samples)]

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
Example #21
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=True)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size, test_dataset.vocab_size,
                       test_dataset.num_labels)
    model = paddle.Model(network)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((test_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for SOS and EOS
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
Example #22
        def preprocess_fn(data):
            examples = []

            if not isinstance(data, list):
                data = [data]

            for text in data:
                input_ids, segment_ids = convert_example(text,
                                                         tokenizer,
                                                         max_seq_length=128,
                                                         is_test=True)
                examples.append((input_ids, segment_ids))

            batchify_fn = lambda samples, fn=Tuple(
                Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
                Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
            ): fn(samples)

            input_ids, segment_ids = batchify_fn(examples)
            return paddle.to_tensor(input_ids,
                                    stop_gradient=False), paddle.to_tensor(
                                        segment_ids, stop_gradient=False)
Example #23
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only support QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_ds, dev_ds = load_dataset('glue', task_name, splits=["train", "dev"])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None,
    )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.map(trans_func, lazy=True)
    dev_ds = dev_ds.map(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
Example #24
    def predict(self, data, tokenizer, label_map):
        """
        Predicts the data labels.
        Args:
            data (obj:`List(str)`): The batch data, each element of which is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
            label_map(obj:`dict`): The label id (key) to label str (value) map.
        Returns:
            results(obj:`list`): All the prediction labels.
        """
        examples = []
        for text in data:
            example = {"text": text}
            input_ids, segment_ids = convert_example(
                example,
                tokenizer,
                max_seq_length=self.max_seq_length,
                is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()

        probs = softmax(logits, axis=1)
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]

        return labels
Example #25
def create_pretraining_dataset(input_file, shared_list, args, worker_init,
                               tokenizer):
    train_data = PretrainingDataset(input_file=input_file,
                                    tokenizer=tokenizer,
                                    max_seq_length=args.max_seq_length)
    # files have been sharded, no need to dispatch again
    train_batch_sampler = paddle.io.BatchSampler(train_data,
                                                 batch_size=args.batch_size,
                                                 shuffle=True)

    # DataLoader cannot be pickled because of its place.
    # If it can be pickled, use global function instead of lambda and use
    # ProcessPoolExecutor instead of ThreadPoolExecutor to prefetch.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_data,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   worker_init_fn=worker_init,
                                   return_list=True)
    return train_data_loader, input_file
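
As the comment above notes, the lambda-with-default idiom is not picklable. If prefetching with `ProcessPoolExecutor` were ever needed, a module-level function wrapped in `functools.partial` would be a picklable equivalent (a sketch under that assumption, not the original code):

import functools
from paddlenlp.data import Pad, Tuple

def _batchify(samples, pad_token_id):
    # Rebuilt per call so the callable stays a plain module-level function,
    # which pickle can serialize (unlike the lambda above).
    fn = Tuple(Pad(axis=0, pad_val=pad_token_id))  # input
    return fn(samples)

batchify_fn = functools.partial(_batchify, pad_token_id=0)
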
Example #26
def main(args):
    paddle.set_device('gpu' if args.n_gpu else 'cpu')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1 and args.do_train:
        dist.init_parallel_env()

    set_seed(args.seed)

    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(dataset_class.convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_len)
    test_trans_func = partial(dataset_class.convert_example,
                              tokenizer=tokenizer,
                              max_seq_length=args.test_max_seq_len)
    metric = metric_class()

    if args.task_name in ('udc', 'dstc2', 'atis_intent', 'mrda', 'swda'):
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Stack(dtype='int64')  # label
        ): fn(samples)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, num_classes=dataset_class.num_classes())
    elif args.task_name == 'atis_slot':
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Pad(axis=0, pad_val=0, dtype='int64')  # label
        ): fn(samples)
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path,
            num_classes=dataset_class.num_classes(),
            dropout=0.0)
    if world_size > 1 and args.do_train:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_data_loader = create_data_loader(args, dataset_class, trans_func,
                                               batchify_fn, 'train')
        if args.do_eval:
            dev_data_loader = create_data_loader(args, dataset_class,
                                                 test_trans_func, batchify_fn,
                                                 'dev')
        else:
            dev_data_loader = None
        train(args, model, train_data_loader, dev_data_loader, metric, rank)

    if args.do_test:
        if rank == 0:
            test_data_loader = create_data_loader(args, dataset_class,
                                                  test_trans_func, batchify_fn,
                                                  'test')
            if args.do_train:
                # If do_eval=True, use best model to evaluate the test data.
                # Otherwise, use final model to evaluate the test data.
                if args.do_eval:
                    args.init_from_ckpt = os.path.join(args.output_dir, 'best')
                    load_ckpt(args, model)
            else:
                if not args.init_from_ckpt:
                    raise ValueError('"init_from_ckpt" should be set.')
                load_ckpt(args, model)
            print('\nTest begin...')
            evaluation(args, model, test_data_loader, metric)
Example #27
    pinyin_ids = encoded_inputs["pinyin_ids"]

    label = np.array([example["label"]], dtype="int64")
    return input_ids, pinyin_ids, label


# Process the data into a data format that the model can read in.
trans_func = partial(convert_example,
                     tokenizer=tokenizer,
                     max_seq_length=args.max_seq_length)

# Form the samples into batches: pad text sequences of different lengths up to
# the longest sequence in the batch, and stack the labels together.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    # Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type_ids
    Pad(axis=0, pad_val=0),  # pinyin_ids
    Stack()  # labels
): [data for data in fn(samples)]

from utils import create_dataloader

train_data_loader = create_dataloader(train_ds,
                                      mode='train',
                                      batch_size=args.batch_size,
                                      batchify_fn=batchify_fn,
                                      trans_fn=trans_func)
dev_data_loader = create_dataloader(dev_ds,
                                    mode='dev',
                                    batch_size=args.batch_size,
                                    batchify_fn=batchify_fn,
                                    trans_fn=trans_func)
Example #28
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    # Load train dataset.
    file_name = 'train.csv'
    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_path, file_name),
                            is_test=False,
                            lazy=False)

    pretrained_model = ppnlp.transformers.BertModel.from_pretrained(
        "bert-base-uncased")
    tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained(
        'bert-base-uncased')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype='float32')  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    model = MultiLabelClassifier(pretrained_model,
                                 num_labels=len(train_ds.data[0]["label"]))

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)
    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    metric = MultiLabelReport()
    criterion = paddle.nn.BCEWithLogitsLoss()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)
            probs = F.sigmoid(logits)
            metric.update(probs, labels)
            auc, f1_score = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, auc: %.5f, f1 score: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, auc, f1_score, 10 /
                       (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % 100 == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir,
                                               "model_state.pdparams")
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)
Example #29
def train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")

    # create dataset.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_dataset,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and its loss
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_dataset.vocab_size, train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_dataset.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training
    callbacks = paddle.callbacks.ProgBarLogger(
        log_freq=10, verbose=3) if args.verbose else None
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              shuffle=True,
              callbacks=callbacks)
Example #30
def do_train(args):
    paddle.set_device(args.select_device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset = dataset_class.get_datasets(["train"])
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
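    # Note: the trailing list comprehension in batchify_fn drops the length
    # field (index 2); the model consumes only input, segment and label.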
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64"
              if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                              lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    num_classes = 1 if train_dataset.get_labels() is None else len(
        train_dataset.get_labels())
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (int(
        math.floor(num_training_steps * args.warmup_proportion)))
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=warmup_steps, num_training_steps=
        num_training_steps: float(current_step) / float(
            max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
                if args.n_cards <= 1 or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)