Example #1
def create_dataloader(dataset,
                      trans_fn=None,
                      mode='train',
                      batch_size=1,
                      pad_token_id=0):
    """
    Creates dataloader.
    Args:
        dataset(obj:`paddle.io.Dataset`): Dataset instance.
        trans_fn(obj:`callable`, optional, defaults to `None`): The function used to transform each example.
        mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly.
        batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.
    Returns:
        dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn, lazy=True)

    shuffle = mode == 'train'
    sampler = paddle.io.BatchSampler(
        dataset=dataset, batch_size=batch_size, shuffle=shuffle)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # input_ids
        Stack(dtype="int32"),  # seq len
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    dataloader = paddle.io.DataLoader(
        dataset,
        batch_sampler=sampler,
        return_list=True,
        collate_fn=batchify_fn)
    return dataloader
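For reference, a minimal sketch of what a `Tuple(Pad, Stack, Stack)` collate function does to a toy batch (assuming `paddlenlp` is installed; the sample values are invented):

from paddlenlp.data import Pad, Stack, Tuple

# Three toy (input_ids, seq_len, label) samples with unequal lengths.
samples = [
    ([2, 5, 7], 3, 1),
    ([2, 9], 2, 0),
    ([2, 5, 7, 8, 3], 5, 1),
]

toy_batchify_fn = Tuple(
    Pad(axis=0, pad_val=0),  # pad input_ids to the batch max length
    Stack(dtype="int32"),    # stack the sequence lengths
    Stack(dtype="int64"),    # stack the labels
)

input_ids, seq_lens, labels = toy_batchify_fn(samples)
print(input_ids)  # shape (3, 5); short rows padded with 0 on the right
print(seq_lens)   # [3 2 5]
print(labels)     # [1 0 1]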
Example #2
def prepare_train_input(insts,
                        bos_idx,
                        eos_idx,
                        pad_idx,
                        pad_seq=1,
                        dtype="int64"):
    """
    Put all padded data needed by training into a list.
    """
    word_pad = Pad(pad_idx, dtype=dtype)
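    # Round up to the smallest multiple of pad_seq that still fits the
    # sequence plus the one extra token (eos / bos) appended below.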
    src_max_len = (max([len(inst[0])
                        for inst in insts]) + pad_seq) // pad_seq * pad_seq
    trg_max_len = (max([len(inst[1])
                        for inst in insts]) + pad_seq) // pad_seq * pad_seq
    src_word = word_pad([
        inst[0] + [eos_idx] + [pad_idx] * (src_max_len - 1 - len(inst[0]))
        for inst in insts
    ])
    trg_word = word_pad([[bos_idx] + inst[1] + [pad_idx] *
                         (trg_max_len - 1 - len(inst[1])) for inst in insts])
    lbl_word = np.expand_dims(word_pad([
        inst[1] + [eos_idx] + [pad_idx] * (trg_max_len - 1 - len(inst[1]))
        for inst in insts
    ]),
                              axis=2)

    data_inputs = [src_word, trg_word, lbl_word]

    return data_inputs
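A quick, plain-Python check of the rounding used above (illustrative values):

for max_len, pad_seq in [(7, 1), (7, 4), (8, 4)]:
    padded_len = (max_len + pad_seq) // pad_seq * pad_seq
    print(max_len, pad_seq, "->", padded_len)  # 8, 8, 12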
Example #3
def init_lstm_var(args):
    vocab = Vocab.load_vocabulary(args.vocab_path,
                                  unk_token='[UNK]',
                                  pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../../punctuations')
    padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init attention layer
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = Senti_data().read(args.data_dir)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    ): [data for data in fn(samples)]

    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
Example #4
    def __init__(self, args={}):
        super(TransformerReader, self).__init__()

        dataset = load_dataset('wmt14ende', splits='test')
        if not args.benchmark:
            self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
        else:
            self.vocab = Vocab.load_vocabulary(
                **dataset.vocab_info["benchmark"])
        self.src_vocab = self.trg_vocab = self.vocab

        def convert_samples(samples):
            source = []
            for sample in samples:
                src = sample.split()
                source.append(self.src_vocab.to_indices(src))

            return source

        self.tokenize = convert_samples
        self.to_tokens = self.trg_vocab.to_tokens
        self.feed_keys = ["src_word"]
        self.bos_idx = args.bos_idx
        self.eos_idx = args.eos_idx
        self.pad_idx = args.bos_idx
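        # Note: this snippet pads with the bos index rather than a dedicated
        # pad token; that is how the original source sets it up.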
        self.pad_seq = args.pad_seq
        self.word_pad = Pad(self.pad_idx)
Example #5
    def predict(self, data, tokenizer, batch_size=1, threshold=0.5):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(str)`): The batch data, each element of which is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
            batch_size(obj:`int`, defaults to 1): The number of samples in one batch.
            threshold(obj:`float`, defaults to 0.5): The threshold for converting probabilities to labels.

        Returns:
            results(obj:`list`): All the predicted labels.
        """
        examples = []
        for text in data:
            example = {"text": text}
            input_ids, segment_ids = convert_example(
                example,
                tokenizer,
                max_seq_length=self.max_seq_length,
                is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        # Separate the data into batches.
        batches = [
            examples[idx:idx + batch_size]
            for idx in range(0, len(examples), batch_size)
        ]

        results = []
        for batch in batches:
            input_ids, segment_ids = batchify_fn(batch)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = paddle.to_tensor(self.output_handle.copy_to_cpu())
            probs = F.sigmoid(logits)
            preds = (probs.numpy() > threshold).astype(int)
            results.extend(preds)
        return results
Example #6
def predict_cls(args, ext_results):
    # Load the label dict.
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id), Stack(dtype="int64")
    ): fn(samples)

    # Keep shuffle False for deterministic test order.
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # load cls model
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()

    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)

        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results
Example #7
def infer(args):
    paddle.set_device(args.device)

    # create dataset.
    infer_dataset = LacDataset(args.data_dir, mode='infer')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
    ): fn(samples)

    # Create sampler for dataloader
    infer_sampler = paddle.io.BatchSampler(
        dataset=infer_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    infer_loader = paddle.io.DataLoader(
        dataset=infer_dataset,
        batch_sampler=infer_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network
    network = BiGruCrf(args.emb_dim, args.hidden_size, infer_dataset.vocab_size,
                       infer_dataset.num_labels)
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    model.prepare()

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    emissions, lengths, crf_decodes = model.predict(
        test_data=infer_loader, batch_size=args.batch_size)

    # Post-processing the lexical analysis results
    lengths = np.array([l for lens in lengths for l in lens]).reshape([-1])
    preds = np.array(
        [pred for batch_pred in crf_decodes for pred in batch_pred])

    results = parse_lac_result(infer_dataset.word_ids, preds, lengths,
                               infer_dataset.word_vocab,
                               infer_dataset.label_vocab)

    sent_tags = []
    for sent, tags in results:
        sent_tag = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        sent_tags.append(''.join(sent_tag))

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(sent_tags[:10]))
Example #8
    def _batchify(self, data: List[List[str]], max_seq_len: int,
                  batch_size: int):
        """
        Generate input batches.
        """
        padding = batch_size != 1
        pad_func = Pad(pad_val=self.tokenizer.pad_token_id, pad_right=False)

        def pad_mask(batch_attention_mask):
            batch_size = len(batch_attention_mask)
            max_len = max(map(len, batch_attention_mask))
            attention_mask = np.ones(
                (batch_size, max_len, max_len), dtype='float32') * -1e9
            for i, mask_data in enumerate(attention_mask):
                seq_len = len(batch_attention_mask[i])
                mask_data[-seq_len:,
                          -seq_len:] = np.array(batch_attention_mask[i],
                                                dtype='float32')
            # In order to ensure the correct broadcasting mechanism, expand one
            # dimension to the second dimension (n_head of Transformer).
            attention_mask = np.expand_dims(attention_mask, axis=1)
            return attention_mask

        def _parse_batch(batch_examples):
            if padding:
                input_ids = pad_func(
                    [example['input_ids'] for example in batch_examples])
                token_type_ids = pad_func(
                    [example['token_type_ids'] for example in batch_examples])
                position_ids = pad_func(
                    [example['position_ids'] for example in batch_examples])
                attention_mask = pad_mask(
                    [example['attention_mask'] for example in batch_examples])
            else:
                input_ids = np.asarray(
                    [example['input_ids'] for example in batch_examples])
                token_type_ids = np.asarray(
                    [example['token_type_ids'] for example in batch_examples])
                position_ids = np.asarray(
                    [example['position_ids'] for example in batch_examples])
                attention_mask = np.asarray(
                    [example['attention_mask'] for example in batch_examples])
                attention_mask = np.expand_dims(attention_mask, 0)

            return input_ids, token_type_ids, position_ids, attention_mask

        examples = []
        for texts in data:
            examples.append(self._convert_text_to_input(texts, max_seq_len))

        # Separate the data into batches.
        one_batch = []
        for example in examples:
            one_batch.append(example)
            if len(one_batch) == batch_size:
                yield _parse_batch(one_batch)
                one_batch = []
        if one_batch:
            yield _parse_batch(one_batch)
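To make the left-padded additive mask above concrete, here is a self-contained numpy sketch of the same construction (the causal 0/1 masks are invented; -1e9 marks positions attention should ignore):

import numpy as np

def pad_mask_sketch(batch_attention_mask):
    # Start from an all-masked (-1e9) square canvas per sample, then write
    # each sample's own mask into the bottom-right corner, which matches
    # left-padded (pad_right=False) inputs.
    batch_size = len(batch_attention_mask)
    max_len = max(map(len, batch_attention_mask))
    attention_mask = np.ones(
        (batch_size, max_len, max_len), dtype='float32') * -1e9
    for i, mask_data in enumerate(attention_mask):
        seq_len = len(batch_attention_mask[i])
        mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i],
                                                   dtype='float32')
    # The extra axis broadcasts over the n_head dimension of the Transformer.
    return np.expand_dims(attention_mask, axis=1)

masks = [np.tril(np.ones((2, 2))), np.tril(np.ones((3, 3)))]
print(pad_mask_sketch(masks).shape)  # (2, 1, 3, 3)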
Example #9
def prepare_train_input(insts, bos_id, eos_id, pad_id):
    # Wrap the source with bos/eos ids; trg and label are one-token-shifted
    # views of each sequence (decoder input vs. prediction target).
    src = [[bos_id] + inst + [eos_id] for inst in insts]
    trg = [inst[:-1] for inst in insts]
    label = [inst[1:] for inst in insts]

    # Pad sequences to the batch max length using the pad id; ret_length=True
    # also returns the original lengths.
    src, src_length = Pad(pad_val=pad_id, ret_length=True, dtype="int64")(src)
    trg, trg_length = Pad(pad_val=pad_id, ret_length=True, dtype="int64")(trg)
    label, _ = Pad(pad_val=pad_id, ret_length=True, dtype="int64")(label)

    label = np.array(label)
    label = label.reshape((label.shape[0], label.shape[1], 1))
    return src, src_length, trg, trg_length, label
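For reference, `Pad(..., ret_length=True)` returns the padded batch together with the original lengths; a sketch assuming `paddlenlp` is installed:

from paddlenlp.data import Pad

padded, lengths = Pad(pad_val=0, ret_length=True, dtype="int64")([[1, 2, 3], [4, 5]])
print(padded)   # [[1 2 3] [4 5 0]]
print(lengths)  # [3 2]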
Example #10
    def predict(self, data, tokenizer):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.

        Returns:
            None. All corpus embeddings are concatenated and saved to `corpus_embedding.npy`.
        """

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        all_embeddings = []
        examples = []
        for idx, text in enumerate(tqdm(data)):
            input_ids, segment_ids = convert_example(
                text,
                tokenizer,
                max_seq_length=self.max_seq_length,
                pad_to_max_seq_len=True)
            examples.append((input_ids, segment_ids))
            if len(examples) >= 100:
                input_ids, segment_ids = batchify_fn(examples)
                self.input_handles[0].copy_from_cpu(input_ids)
                self.input_handles[1].copy_from_cpu(segment_ids)
                self.predictor.run()
                logits = self.output_handle.copy_to_cpu()
                all_embeddings.append(logits)
                examples = []

        if examples:
            input_ids, segment_ids = batchify_fn(examples)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            all_embeddings.append(logits)

        all_embeddings = np.concatenate(all_embeddings, axis=0)
        np.save('corpus_embedding', all_embeddings)
Example #11
def prepare_infer_input(insts, bos_idx, eos_idx, pad_idx):
    """
    Put all padded data needed by beam search decoder into a list.
    """
    word_pad = Pad(pad_idx)
    src_word = word_pad([inst[0] + [eos_idx] for inst in insts])

    return [src_word, ]
Example #12
    def predict(self, data, tokenizer):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.

        Returns:
            probs(obj:`numpy.ndarray`): The class probabilities for every sample.
        """
        if args.benchmark:
            self.autolog.times.start()

        examples = []
        for text in data:
            input_ids, segment_ids = convert_example(text,
                                                     tokenizer,
                                                     is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        if args.benchmark:
            self.autolog.times.stamp()

        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        if args.benchmark:
            self.autolog.times.stamp()

        probs = softmax(logits, axis=1)
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()

        if args.benchmark:
            self.autolog.times.end(stamp=True)

        return probs
Example #13
def build_data_loader(args, tokenizer):
    """ build corpus_data_loader and text_data_loader
    """

    id2corpus = gen_id2corpus(args.corpus_file)

    # convert_example's input must be a dict
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
    ): [data for data in fn(samples)]

    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # build text data_loader
    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)

    text_ds = MapDataset(text_list)

    text_data_loader = create_dataloader(text_ds,
                                         mode='predict',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    d = {
        "text_data_loader": text_data_loader,
        "corpus_data_loader": corpus_data_loader,
        "id2corpus": id2corpus,
        "text2similar_text": text2similar_text,
        "text_list": text_list
    }

    return d
Example #14
def create_test_dataloader(args):
    '''
    Build the dataloader for testing.
    Create dataset, tokenizer and dataloader.

    input:
        args: the arguments provided by the config file
    return:
        test_data_loader
    '''
    no_entity_id = 0

    # Load the dataset
    test_ds = load_dataset('TEDTalk', splits='test', lazy=False)

    # Build the dataloader
    model_name_or_path = args.model_name_or_path
    tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),  # seq_len
        'labels':
        Pad(axis=0, pad_val=args.ignore_label, dtype='int64')  # label
    }): fn(samples)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    return test_data_loader
Example #15
    def predict(self,
                data,
                tokenizer,
                label_map,
                batch_size=1,
                network="bilstm"):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(Example)`): The processed data, each element of which is an Example (namedtuple) object
                containing `text` (word ids) and `seq_len` (sequence length).
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
                :class:`~paddlenlp.transformers.PretrainedTokenizer` which contains most of the methods.
                Users should refer to the superclass for more information regarding methods.
            label_map(obj:`dict`): The label id (key) to label str (value) map.
            batch_size(obj:`int`, defaults to 1): The number of samples in one batch.
            network(obj:`str`, defaults to `bilstm`): The network name, which decides whether sequence lengths are fed to the predictor.

        Returns:
            results(obj:`list`): All the predicted labels.
        """
        examples = []
        for text in data:
            input_id, seq_len = preprocess_prediction_data(text, tokenizer)
            examples.append((input_id, seq_len))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.vocab.token_to_idx.get("[PAD]", 0)
                ),  # input_id
            Stack()  # seq_len
        ): fn(samples)

        # Separate the data into batches.
        batches = [
            examples[idx:idx + batch_size]
            for idx in range(0, len(examples), batch_size)
        ]

        results = []
        for batch in batches:
            input_ids, seq_lens = batchify_fn(batch)
            self.input_handles[0].copy_from_cpu(input_ids)
            if network in [
                    "lstm", "bilstm", "gru", "bigru", "rnn", "birnn",
                    "bilstm_attn"
            ]:
                self.input_handles[1].copy_from_cpu(seq_lens)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            probs = softmax(logits, axis=1)
            idx = np.argmax(probs, axis=1)
            idx = idx.tolist()
            labels = [label_map[i] for i in idx]
            results.extend(labels)
        return results
Example #16
def pad_sequence_paddle(inputs, lens, pad_index=0):
    sequences = []
    idx = 0
    for l in lens:
        sequences.append(inputs[idx:idx + l])
        idx += l
    outputs = Pad(pad_val=pad_index)(sequences)
    output_tensor = paddle.to_tensor(outputs)
    return output_tensor
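Illustrative usage of the helper above: it splits a flat id sequence into per-segment chunks by `lens`, pads them, and returns a paddle tensor (assuming `paddle` and `paddlenlp` are installed; the ids are invented):

# Two segments stored back to back: lengths 3 and 2.
flat_ids = [7, 8, 9, 4, 5]
out = pad_sequence_paddle(flat_ids, [3, 2], pad_index=0)
print(out.numpy())  # [[7 8 9] [4 5 0]]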
Example #17
def predict(model, data, label_map, batch_size=1, pad_token_id=0):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data, each element of which is an Example (namedtuple) object
            containing `text` (word ids) and `seq_len` (sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The number of samples in one batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        results(obj:`list`): All the predicted labels.
    """

    # Separate the data into batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    results = []
    model.eval()
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)
        logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
Example #18
def do_predict(data,
               model,
               tokenizer,
               viterbi_decoder,
               tags_to_idx,
               idx_to_tags,
               batch_size=1,
               summary_num=2):

    examples = []
    for text in data:
        example = {"tokens": list(text)}
        input_ids, token_type_ids, seq_len = convert_example(example,
                                                             tokenizer,
                                                             args.max_seq_len,
                                                             is_test=True)

        examples.append((input_ids, token_type_ids, seq_len))

    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    ): fn(samples)

    all_pred_tags = []

    model.eval()
    for batch in batches:
        input_ids, token_type_ids, seq_len = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        seq_len = paddle.to_tensor(seq_len)
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())
    results = decode(data, all_pred_tags, summary_num, idx_to_tags)
    return results
Example #19
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
    print(dev_ds[0])

    pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
    tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')

    trans_func_eval = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        phase="eval")

    batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn_eval,
        trans_fn=trans_func_eval)

    model = PairwiseMatching(pretrained_model, margin=args.margin)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    metric = paddle.metric.Auc()
    evaluate(model, metric, dev_data_loader, "dev")
Example #20
    def preprocess(self, input_dicts, data_id, log_id):
        from paddlenlp.data import Stack, Tuple, Pad

        (_, input_dict), = input_dicts.items()
        print("input dict", input_dict)
        batch_size = len(input_dict)
        examples = []
        for i in range(batch_size):
            input_ids, segment_ids = convert_example([input_dict[str(i)]],
                                                     self.tokenizer)
            examples.append((input_ids, segment_ids))
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # segment
        ): fn(samples)
        input_ids, segment_ids = batchify_fn(examples)
        feed_dict = {}
        feed_dict['input_ids'] = input_ids
        feed_dict['token_type_ids'] = segment_ids
        return feed_dict, False, None, ""
Example #21
def prepare_train_input(insts, pad_idx):
    """
    Put all padded data needed by training into a list.
    """
    word_pad = Pad(pad_idx)
    src_word = word_pad([inst[0] for inst in insts])
    trg_word = word_pad([inst[1][:-1] for inst in insts])
    lbl_word = word_pad([inst[1][1:] for inst in insts])
    data_inputs = [src_word, trg_word, lbl_word]

    return data_inputs
Example #22
def default_collator(tokenizer, args):
    """ Default collator for sequence classification.

    Args:
        tokenizer (PretrainedTokenizer): tokenizer of PretrainedModel
        args: data arguments; the label list decides the label dtype.

    Returns:
        batchify_fn (function): collator
    """
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
        "token_type_ids":
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
        "labels":
        Stack(dtype="int64" if args.label_list else "float32")  # labels
    }): fn(samples)

    return batchify_fn
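A minimal sketch of how a `Dict`-based collator like the one above consumes dict-shaped samples (assuming `paddlenlp` is installed; the field values are invented):

from paddlenlp.data import Dict, Pad, Stack

samples = [
    {"input_ids": [101, 7, 8, 102], "token_type_ids": [0, 0, 0, 0], "labels": 1},
    {"input_ids": [101, 9, 102], "token_type_ids": [0, 0, 0], "labels": 0},
]

collate = Dict({
    "input_ids": Pad(axis=0, pad_val=0),    # pad to the batch max length
    "token_type_ids": Pad(axis=0, pad_val=0),
    "labels": Stack(dtype="int64"),
})

input_ids, token_type_ids, labels = collate(samples)
print(input_ids.shape, labels)  # (2, 4) [1 0]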
Example #23
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size, test_dataset.vocab_size,
                       test_dataset.num_labels)
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    chunk_evaluator = ChunkEvaluator(
        label_list=test_dataset.label_vocab.keys(), suffix=True)
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
Example #24
    def predict(self,
                data,
                word_vocab,
                label_vocab,
                normlize_vocab,
                batch_size=1):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(Example)`): The processed data, each element of which is an Example (namedtuple) object
                containing `text` (word ids) and `seq_len` (sequence length).
            word_vocab(obj:`dict`): The word id (key) to word str (value) map.
            label_vocab(obj:`dict`): The label id (key) to label str (value) map.
            normlize_vocab(obj:`dict`): The fullwidth char (key) to halfwidth char (value) map.
            batch_size(obj:`int`, defaults to 1): The number of samples in one batch.

        Returns:
            results(obj:`list`): All the predicted labels.
        """
        examples = []

        for text in data:
            tokens = list(text.strip())
            token_ids, length = convert_example(
                tokens,
                self.max_seq_length,
                word_vocab=word_vocab,
                normlize_vocab=normlize_vocab)
            examples.append((token_ids, length))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=0),  # input
            Stack(axis=0),  # length
        ): fn(samples)

        batches = [
            examples[idx:idx + batch_size]
            for idx in range(0, len(examples), batch_size)
        ]

        results = []

        for batch in batches:
            token_ids, length = batchify_fn(batch)
            self.input_handles[0].copy_from_cpu(token_ids)
            self.input_handles[1].copy_from_cpu(length)
            self.predictor.run()
            preds = self.output_handle.copy_to_cpu()
            result = parse_result(token_ids, preds, length, word_vocab,
                                  label_vocab)
            results.extend(result)
        return results
Example #25
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=True)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size, test_dataset.vocab_size,
                       test_dataset.num_labels)
    model = paddle.Model(network)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((test_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for SOS and EOS
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
Example #26
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only support QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    dataset_class = TASK_CLASSES[task_name]

    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None,
    )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.apply(trans_func, lazy=True)
    dev_ds = dev_ds.apply(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Stack(dtype="int64" if train_ds.get_labels() else "float32")  # label
    ): [data for data in fn(samples)]

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
Example #27
def get_train_dataloader(tokenizer, args):
    splits = "train"
    data_dir = args.data_dir
    filename = os.path.join(data_dir, "cmrc2018_" + splits + ".pkl")

    if os.path.exists(filename):
        ds = load_pickle(filename)
    else:
        ds = load_dataset("cmrc2018", splits=splits)
        ds.map(
            partial(prepare_train_features_paddlenlp,
                    tokenizer=tokenizer,
                    args=args),
            batched=True,
            lazy=False,
        )
        save_pickle(ds, filename)

    batch_sampler = BatchSampler(ds,
                                 batch_size=args.train_batch_size,
                                 shuffle=True)

    batchify_fn = lambda samples, fn=Dict(
        {
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=0),
            "pinyin_ids": Pad(axis=0, pad_val=0),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64"),
        }): fn(samples)

    data_loader = DataLoader(
        dataset=ds,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        num_workers=args.num_workers,
        return_list=True,
    )

    return data_loader
Example #28
def get_mnli_dev_dataloader(tokenizer, args, matched=True):
    if matched:
        split = "dev_matched"
    else:
        split = "dev_mismatched"
    filename = os.path.join("caches", args.task_name + f"_{split}" + ".pkl")
    if os.path.exists(filename):
        ds = load_pickle(filename)
    else:
        ds = load_dataset("glue", args.task_name, splits=split)
        ds.map(
            partial(trans_func, tokenizer=tokenizer, args=args),
            batched=False,
            lazy=False,
        )
        save_pickle(ds, filename)

    batch_sampler = BatchSampler(ds,
                                 batch_size=args.train_batch_size,
                                 shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # attention_mask
        Pad(axis=0, pad_val=-100, dtype="int64"),  # lm_labels
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # decoder_attention_mask
    ): fn(samples)

    data_loader = DataLoader(
        dataset=ds,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        num_workers=args.num_workers,
        return_list=True,
    )

    return data_loader
Example #29
        def preprocess_fn(data):
            examples = []

            if not isinstance(data, list):
                data = [data]

            for text in data:
                input_ids, segment_ids = convert_example(text,
                                                         tokenizer,
                                                         max_seq_length=128,
                                                         is_test=True)
                examples.append((input_ids, segment_ids))

            batchify_fn = lambda samples, fn=Tuple(
                Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
                Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
            ): fn(samples)

            input_ids, segment_ids = batchify_fn(examples)
            return paddle.to_tensor(input_ids,
                                    stop_gradient=False), paddle.to_tensor(
                                        segment_ids, stop_gradient=False)
Example #30
def prepare_train_input(insts, bos_idx, eos_idx, pad_idx):
    """
    Put all padded data needed by training into a list.
    """
    word_pad = Pad(pad_idx)
    src_word = word_pad([inst[0] + [eos_idx] for inst in insts])
    trg_word = word_pad([[bos_idx] + inst[1] for inst in insts])
    lbl_word = np.expand_dims(
        word_pad([inst[1] + [eos_idx] for inst in insts]), axis=2)

    data_inputs = [src_word, trg_word, lbl_word]

    return data_inputs
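A toy run of the function above, assuming `paddlenlp` is installed and `numpy` is imported as `np` (the token ids are invented):

insts = [([4, 5], [6, 7, 8]), ([9], [10])]
src, trg, lbl = prepare_train_input(insts, bos_idx=1, eos_idx=2, pad_idx=0)
print(src)        # [[4 5 2] [9 2 0]]
print(trg)        # [[1 6 7 8] [1 10 0 0]]
print(lbl.shape)  # (2, 4, 1) -- labels carry a trailing axis for the loss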