Example 1
def init_lstm_var(args):
    if args.language == 'ch':
        vocab = Vocab.load_vocabulary("../task/similarity/simnet/vocab.char",
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
    else:
        vocab = Vocab.load_vocabulary("../task/similarity/simnet/vocab_QQP",
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')

    tokenizer = CharTokenizer(vocab, args.language, '../punctuations')
    model = SimNet(network='lstm', vocab_size=len(vocab), num_classes=2)

    dev_ds = SimilarityData().read(os.path.join(args.data_dir, 'dev'))
    dev_examples = preprocess_data(dev_ds.data,
                                   tokenizer,
                                   language=args.language)
    batches = [
        dev_examples[idx:idx + args.batch_size]
        for idx in range(0, len(dev_examples), args.batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # query_ids
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    return model, tokenizer, batches, batchify_fn, vocab, dev_ds
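The examples in this collection all rely on the same collation idiom from paddlenlp.data: Tuple applies one batchify function per positional field, Pad pads variable-length sequences to the longest one in the batch, and Stack stacks scalars or fixed-size arrays into a single array. A minimal sketch on toy data (assuming paddlenlp is installed; the ids are made up):

from paddlenlp.data import Pad, Stack, Tuple

# Each toy sample: (variable-length token ids, sequence length)
samples = [([1, 2, 3], 3), ([4, 5], 2)]

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0),   # token ids padded to the longest sequence
    Stack(dtype="int64"),     # sequence lengths stacked into one array
): [data for data in fn(samples)]

token_ids, seq_lens = batchify_fn(samples)
print(token_ids)  # [[1 2 3] [4 5 0]]
print(seq_lens)   # [3 2]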
Example 2
    def build_dataset(index, name, num_samples):
        dataset = GPTDataset(file_path=input_path,
                             build_data_file=local_rank == 0,
                             name="gpt_" + name,
                             max_seq_len=max_seq_len,
                             num_samples=num_samples,
                             documents=np.arange(splits[index],
                                                 splits[index + 1]),
                             sample_ids=sample_ids,
                             sample_lens=sample_lens,
                             eos_id=eos_id,
                             seed=args.seed)

        batch_sampler = DistributedBatchSampler(
            dataset,
            batch_size=args.local_batch_size,
            num_replicas=data_world_size,
            rank=data_world_rank,
            shuffle=False,
            drop_last=True)

        data_loader = DataLoader(
            dataset=dataset,
            places=places,
            feed_list=data_holders,
            batch_sampler=batch_sampler,
            num_workers=1,
            worker_init_fn=worker_init,
            # collate_fn=Tuple(Stack(), Stack(), Stack(), Stack(), Stack()),
            collate_fn=Tuple(Stack(), Stack(), Stack()),
            return_list=False)
        return data_loader
Example 3
def create_dataloader(dataset,
                      trans_fn=None,
                      mode='train',
                      batch_size=1,
                      pad_token_id=0):
    """
    Creats dataloader.
    Args:
        dataset(obj:`paddle.io.Dataset`): Dataset instance.
        mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly.
        batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.
    Returns:
        dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn, lazy=True)

    shuffle = True if mode == 'train' else False
    sampler = paddle.io.BatchSampler(dataset=dataset,
                                     batch_size=batch_size,
                                     shuffle=shuffle)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # input_ids
        Stack(dtype="int32"),  # seq len
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    dataloader = paddle.io.DataLoader(dataset,
                                      batch_sampler=sampler,
                                      return_list=True,
                                      collate_fn=batchify_fn)
    return dataloader
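A minimal usage sketch for create_dataloader above, assuming paddlenlp is installed; the toy samples follow the (input_ids, seq_len, label) layout the collator expects, and all ids are made up:

from paddlenlp.datasets import MapDataset

toy_ds = MapDataset([([2, 7, 9], 3, 1), ([5, 3], 2, 0)])
loader = create_dataloader(toy_ds, mode="dev", batch_size=2, pad_token_id=0)
for input_ids, seq_lens, labels in loader:
    print(input_ids.shape, seq_lens.numpy(), labels.numpy())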
Example 4
    def _test_impl(self, list_fn=True):
        if list_fn:
            batchify_fn = Tuple([Stack(), Pad(axis=0, pad_val=0)])
        else:
            batchify_fn = Tuple(Stack(), Pad(axis=0, pad_val=0))
        result = batchify_fn(self.input)
        self.check_output_equal(result[0], self.expected_result[0])
        self.check_output_equal(result[1], self.expected_result[1])
Example 5
def qa_collator(tokenizer, args):
    train_batchify_fn = lambda samples, fn=Dict(
        {
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        }): fn(samples)

    return train_batchify_fn
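For dict-shaped features such as the ones produced by a tokenizer, Dict plays the role Tuple plays for positional fields: it picks each key out of every sample, applies the matching batchify function, and returns the batched fields in the order of the keys. A hedged sketch with toy ids (the pad values of 0 stand in for the tokenizer's real pad ids):

from paddlenlp.data import Dict, Pad, Stack

samples = [
    {"input_ids": [101, 7, 8, 102], "token_type_ids": [0, 0, 1, 1],
     "start_positions": 1, "end_positions": 2},
    {"input_ids": [101, 9, 102], "token_type_ids": [0, 0, 1],
     "start_positions": 1, "end_positions": 1},
]
collate = Dict({
    "input_ids": Pad(axis=0, pad_val=0),
    "token_type_ids": Pad(axis=0, pad_val=0),
    "start_positions": Stack(dtype="int64"),
    "end_positions": Stack(dtype="int64"),
})
input_ids, token_type_ids, start_positions, end_positions = collate(samples)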
Example 6
def interpret(model, data, label_map, batch_size=1, pad_token_id=0, vocab=None):
    """
    Computes the attention weights used to interpret the model's predictions.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data whose each element is an Example (namedtuple) object.
            An Example object contains `text`(word_ids) and `seq_len`(sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The batch size.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.
        vocab(obj:`Vocab`, optional): The vocabulary used to map token ids back to tokens.

    Returns:
        results(obj:`list`): The query/title attention weights and tokens for each batch.
    """

    # Separates the data into batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    model.eval()
    results = []
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)

        logits, attention, _ = model.forward_interpret(
            query_ids, title_ids, query_seq_lens, title_seq_lens)
        query_att = attention[0]
        title_att = attention[1]

        model.clear_gradients()
        for query_id, title_id in zip(query_ids.numpy().tolist(),
                                      title_ids.numpy().tolist()):
            query = [vocab._idx_to_token[idx] for idx in query_id]
            title = [vocab._idx_to_token[idx] for idx in title_id]
        results.append([query_att, query, title_att, title])

        print('query_att: %s' % query_att.shape)
        print('title_att: %s' % title_att.shape)

    return results
Example 7
def main():
    paddle.seed(42)
    args = parse_args()

    args.task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()

    predictor = Predictor.create_predictor(args)

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    dev_ds = load_dataset('clue', args.task_name, splits='dev')

    if not args.use_faster_tokenizer:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    else:
        trans_func = partial(convert_example,
                             label_list=dev_ds.label_list,
                             is_test=False)
        dev_ds = dev_ds.map(trans_func, lazy=True)
    if not args.use_faster_tokenizer:
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
        ): fn(samples)
        outputs = predictor.predict(dev_ds, tokenizer, batchify_fn, args)
    else:
        outputs = predictor.faster_predict(dev_ds, args=args)
Example 8
def init_lstm_var(args):
    vocab = Vocab.load_vocabulary(args.vocab_path,
                                  unk_token='[UNK]',
                                  pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../../punctuations')
    padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init attention layer
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = Senti_data().read(args.data_dir)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    ): [data for data in fn(samples)]

    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
Example 9
    def predict_cls(self, args, ext_results):
        test_ds = MapDataset(ext_results)
        trans_func = partial(convert_example_to_feature_cls,
                             tokenizer=self.tokenizer,
                             label2id=self.cls_label2id,
                             max_seq_len=args.cls_max_seq_len,
                             is_test=True)
        test_ds = test_ds.map(trans_func, lazy=False)
        batch_list = [
            test_ds[idx:idx + args.batch_size]
            for idx in range(0, len(test_ds), args.batch_size)
        ]

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),
            Pad(axis=0,
                pad_val=self.tokenizer.pad_token_type_id,
                dtype="int64"), Stack(dtype="int64")): fn(samples)

        results = []
        for batch_data in batch_list:
            input_ids, token_type_ids, _ = batchify_fn(batch_data)
            self.cls_input_handles[0].copy_from_cpu(input_ids)
            self.cls_input_handles[1].copy_from_cpu(token_type_ids)
            self.cls_predictor.run()
            logits = self.cls_output_hanle.copy_to_cpu()

            predictions = logits.argmax(axis=1).tolist()
            results.extend(predictions)

        return results
Example 10
def main():
    args = parse_args()

    predictor = Predictor.create_predictor(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    dataset = dataset_class.get_datasets("test")
    tokenizer = tokenizer_class.from_pretrained(
        os.path.dirname(args.model_path))
    transform_fn = partial(convert_example,
                           tokenizer=tokenizer,
                           label_list=dataset.get_labels(),
                           max_seq_length=args.max_seq_length,
                           is_test=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
    ): [data for i, data in enumerate(fn(samples)) if i != 2]  # drop the length field (index 2)
    dataset = dataset.apply(transform_fn)

    predictor.predict(dataset,
                      batch_size=args.batch_size,
                      collate_fn=batchify_fn)
Example 11
    def _collate_data(data, stack_fn=Stack()):
        num_fields = len(data[0])
        out = [None] * num_fields
        # input_ids, segment_ids, input_mask, masked_lm_positions,
        # masked_lm_labels, next_sentence_labels, mask_token_num
        for i in (0, 1, 2, 5):
            out[i] = stack_fn([x[i] for x in data])
        batch_size, seq_length = out[0].shape
        size = num_mask = sum(len(x[3]) for x in data)
        # Padding for divisibility by 8 for fp16 or int8 usage
        if size % 8 != 0:
            size += 8 - (size % 8)
        # masked_lm_positions
        # Organize as a 1D tensor for gather or use gather_nd
        out[3] = np.full(size, 0, dtype=np.int32)
        # masked_lm_labels
        out[4] = np.full([size, 1], -1, dtype=np.int64)
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[3]):
                out[3][mask_token_num] = i * seq_length + pos
                out[4][mask_token_num] = x[4][j]
                mask_token_num += 1
        # mask_token_num
        out.append(np.asarray([mask_token_num], dtype=np.float32))
        if args.use_amp and args.use_pure_fp16:
            # cast input_mask to fp16
            out[2] = out[2].astype(np.float16)
            # cast masked_lm_scale to fp16
            out[-1] = out[-1].astype(np.float16)
        return out
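Why the masked positions are flattened in the pretraining collators above: with the batch viewed as one long vector of batch_size * seq_length tokens, position pos of sample i lives at flat index i * seq_length + pos, so the masked tokens can be gathered with a single 1D index tensor instead of gather_nd. A small numpy sketch of that index arithmetic:

import numpy as np

seq_length = 4
input_ids = np.array([[11, 12, 13, 14],
                      [21, 22, 23, 24]])
masked_positions = [(0, 2), (1, 1), (1, 3)]  # (sample index, position within the sample)
flat_idx = np.array([i * seq_length + pos for i, pos in masked_positions])
print(input_ids.reshape(-1)[flat_idx])  # [13 22 24], the tokens at the masked positions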
Example 12
def news_title_classification():
    """Text classification: news headline classification."""
    data = json.loads(request.data)
    if isinstance(data["texts"], str):
        data["texts"] = [data["texts"]]

    if isinstance(data["texts"], list):
        datasets = []
        for text in data["texts"]:
            datasets.append({"text": text})
        datasets = MapDataset(datasets)
        trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             max_seq_length=64)
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype="int64")  # label
        ): fn(samples)
        data_loader = create_dataloader(datasets,
                                        mode="test",
                                        batch_size=32,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

        labels = inference(model, data_loader)
        labels_text = []
        for label in labels:
            labels_text.append(labels_info[str(label)])

        return jsonify(status="Success", results=labels_text)
    else:
        return jsonify(status="Failure",
                       message="Incorrect parameter data type.")
Example 13
    def _collate_data(data, stack_fn=Stack()):
        # Data Fields: input_ids, segment_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels
        num_fields = len(data[0])
        out = [None] * num_fields

        for i in [0, 1, 5]:
            out[i] = stack_fn([x[i] for x in data])
        batch_size, seq_length = out[0].shape
        size = num_mask = sum(len(x[2]) for x in data)
        # masked_lm_positions
        # Organize as a 1D tensor for gather or use gather_nd
        out[2] = np.full(size, 0, dtype=np.int32)
        # masked_lm_labels
        out[3] = np.full([size, 1], -1, dtype=np.int64)
        # masked_lm_weights
        out[4] = np.full([size], 0, dtype="float32")
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[2]):
                out[2][mask_token_num] = i * seq_length + pos
                out[3][mask_token_num] = x[3][j]
                out[4][mask_token_num] = x[4][j]
                mask_token_num += 1
        out.append(np.asarray([mask_token_num], dtype=np.float32))
        seq_len = len(out[0][0])
        rand_mask_idx_list = create_bigbird_rand_mask_idx_list(
            config["num_layers"], seq_len, seq_len, config["nhead"],
            config["block_size"], config["window_size"],
            config["num_global_blocks"], config["num_rand_blocks"],
            config["seed"])
        out.extend(rand_mask_idx_list)
        return out
Example 14
def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)

    model = RobertaForQuestionAnswering.from_pretrained(
        args.from_pretrained, num_classes=args.num_classes)
    map_fn = partial(map_fn_DuCheckList, args=args, tokenizer=tokenizer)
    dev_ds = RCInterpret().read(args.data_dir)

    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict(
        {
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "offset_mapping": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "overflow_to_sample": Stack(dtype='int32'),
        }): fn(samples)

    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)

    return model, tokenizer, dev_dataloader, dev_ds
Example 15
    def _pre_process_text(self, input_texts, max_seq_len=128, batch_size=1):
        infer_data = []
        max_predict_len = max_seq_len - self.summary_num - 1
        short_input_texts = self._split_long_text2short_text_list(
            input_texts, max_predict_len)
        for text in short_input_texts:
            tokens = ["[CLS%i]" % i
                      for i in range(1, self.summary_num)] + list(text)
            tokenized_input = self._tokenizer(tokens,
                                              return_length=True,
                                              is_split_into_words=True,
                                              max_seq_len=max_seq_len)
            infer_data.append([
                tokenized_input['input_ids'],
                tokenized_input['token_type_ids'], tokenized_input['seq_len']
            ])
        infer_ds = MapDataset(infer_data)

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self._tokenizer.pad_token_id, dtype='int64'
                ),  # input_ids
            Pad(axis=0,
                pad_val=self._tokenizer.pad_token_type_id,
                dtype='int64'),  # token_type_ids
            Stack(dtype='int64'),  # seq_len
        ): fn(samples)

        infer_data_loader = paddle.io.DataLoader(infer_ds,
                                                 collate_fn=batchify_fn,
                                                 num_workers=0,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 return_list=True)

        return infer_data_loader, short_input_texts
Example 16
    def _collate_data(data, stack_fn=Stack()):
        num_fields = len(data[0])
        out = [None] * num_fields
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels
        for i in (0, 1, 2, 5):
            out[i] = stack_fn([x[i] for x in data])
        out[5] = out[5].reshape([-1, 1])
        batch_size, seq_length = out[0].shape
        size = num_mask = sum(len(x[3]) for x in data)
        # masked_lm_positions
        # Organize as a 1D tensor for gather or use gather_nd
        if size % 8 != 0:
            size += 8 - (size % 8)
        out[3] = np.full(size, 0, dtype=np.int32)
        # masked_lm_labels
        out[4] = np.full([size, 1], -1, dtype=np.int64)
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[3]):
                out[3][mask_token_num] = i * seq_length + pos
                out[4][mask_token_num] = x[4][j]
                mask_token_num += 1

        return out
Example 17
def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids':
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # segment
        'seq_len':
        Stack(dtype='int64'),
        'labels':
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)
    eval_ds = eval_ds.map(trans_func)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
        precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
Example 18
def infer(args):
    paddle.set_device(args.device)

    # create dataset.
    infer_dataset = LacDataset(args.data_dir, mode='infer')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
    ): fn(samples)

    # Create sampler for dataloader
    infer_sampler = paddle.io.BatchSampler(
        dataset=infer_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    infer_loader = paddle.io.DataLoader(
        dataset=infer_dataset,
        batch_sampler=infer_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network
    network = BiGruCrf(args.emb_dim, args.hidden_size, infer_dataset.vocab_size,
                       infer_dataset.num_labels)
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    model.prepare()

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    emissions, lengths, crf_decodes = model.predict(
        test_data=infer_loader, batch_size=args.batch_size)

    # Post-processing the lexical analysis results
    lengths = np.array([l for lens in lengths for l in lens]).reshape([-1])
    preds = np.array(
        [pred for batch_pred in crf_decodes for pred in batch_pred])

    results = parse_lac_result(infer_dataset.word_ids, preds, lengths,
                               infer_dataset.word_vocab,
                               infer_dataset.label_vocab)

    sent_tags = []
    for sent, tags in results:
        sent_tag = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        sent_tags.append(''.join(sent_tag))

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(sent_tags[:10]))
Example 19
def create_pretrained_dataset(args, input_path, worker_init, worker_index,
                              eod_id):
    train_data = GPT2Dataset(file_path=input_path,
                             worker_index=worker_index,
                             num_samples=args.batch_size * args.max_steps,
                             eod_id=eod_id,
                             seed=args.seed + worker_index)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_data, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_data,
                                   batch_sampler=train_batch_sampler,
                                   num_workers=0,
                                   worker_init_fn=worker_init,
                                   collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                    Stack(), Stack()))
    return train_data_loader
Example 20
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        task_name,
                                        splits=["train", "dev"])
    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']

    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq len
        Stack(dtype="int64")  # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
Example 21
def do_predict():
    paddle.set_device(args.device)

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))

    no_entity_label = "O"
    ignore_label = len(label_map)

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoint {} does not exist".format(args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data) # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        sent = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature([list(sent), []], tokenizer,
                    max_seq_len=args.max_seq_len, is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # token_type_ids
        Stack(dtype='int64') # sequence lens
    ): fn(samples)
    # Separates the data into batches.
    batch_encoded_inputs = [encoded_inputs_list[i: i + args.batch_size]
                            for i in range(0, len(encoded_inputs_list), args.batch_size)]
    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids, seq_lens = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(), seq_lens.tolist()):
            prob_one = [p_list[index][pid] for index, pid in enumerate(p_ids[1: seq_len - 1])]
            label_one = [id2label[pid] for pid in p_ids[1: seq_len - 1]]
            results.append({"probs": prob_one, "labels": label_one})
    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
Example 22
def evaluate(args):
    paddle.set_device(args.device)

    # create dataset.
    test_ds = load_dataset(datafiles=(os.path.join(args.data_dir, 'test.tsv')))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to convert full-width (DBC) characters to half-width (SBC) characters
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(
        convert_example,
        max_seq_len=args.max_seq_len,
        word_vocab=word_vocab,
        label_vocab=label_vocab,
        normlize_vocab=normlize_vocab)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(
        dataset=test_ds,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_sampler=test_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    model = BiGruCrf(args.emb_dim, args.hidden_size,
                     len(word_vocab), len(label_vocab))
    chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)

    # Load the model and start evaluating
    model_dict = paddle.load(args.init_checkpoint)
    model.load_dict(model_dict)

    model.eval()
    chunk_evaluator.reset()
    for batch in test_loader:
        token_ids, length, labels = batch
        preds = model(token_ids, length)
        num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_evaluator.compute(
            length, preds, labels)
        chunk_evaluator.update(num_infer_chunks.numpy(),
                               num_label_chunks.numpy(),
                               num_correct_chunks.numpy())
        precision, recall, f1_score = chunk_evaluator.accumulate()
    print("eval precision: %f, recall: %f, f1: %f" %
          (precision, recall, f1_score))
Example 23
def load_squad_dataset(args):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    features_fn = prepare_train_features if args.is_training else prepare_validation_features
    if args.is_training:
        raw_dataset = load_dataset('squad', split='train')
    else:
        raw_dataset = load_dataset('squad', split='validation')
    column_names = raw_dataset.column_names
    dataset = raw_dataset.map(partial(
        features_fn, tokenizer=tokenizer, args=args),
                              batched=True,
                              remove_columns=column_names,
                              num_proc=4)

    bs = args.micro_batch_size * args.grad_acc_factor * args.batches_per_step * args.num_replica
    args.batch_size = bs
    if args.is_training:
        train_batch_sampler = BatchSampler(
            dataset, batch_size=bs, shuffle=args.shuffle, drop_last=True)
    else:
        train_batch_sampler = BatchSampler(
            dataset, batch_size=bs, shuffle=args.shuffle, drop_last=False)

    if args.is_training:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack(),
            "start_positions": Stack(),
            "end_positions": Stack()
        }): fn(samples)
    else:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack()}): fn(samples)

    data_loader = DataLoader(
        dataset=dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=collate_fn,
        return_list=True)
    return raw_dataset, data_loader
Example 24
    def predict(self,
                data,
                tokenizer,
                label_map,
                batch_size=1,
                network="bilstm"):
        """
        Predicts the data labels.

        Args:
            model (obj:`paddle.nn.Layer`): A model to classify texts.
            data (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object.
                A Example object contains `text`(word_ids) and `se_len`(sequence length).
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from 
                :class:`~paddlenlp.transformers.PretrainedTokenizer` which contains most of the methods.
                 Users should refer to the superclass for more information regarding methods.
            label_map(obj:`dict`): The label id (key) to label str (value) map.
            batch_size(obj:`int`, defaults to 1): The number of batch.

        Returns:
            results(obj:`dict`): All the predictions labels.
        """
        examples = []
        for text in data:
            input_id, seq_len = preprocess_prediction_data(text, tokenizer)
            examples.append((input_id, seq_len))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.vocab.token_to_idx.get("[PAD]", 0)
                ),  # input_id
            Stack()  # seq_len
        ): fn(samples)

        # Separates the data into batches.
        batches = [
            examples[idx:idx + batch_size]
            for idx in range(0, len(examples), batch_size)
        ]

        results = []
        for batch in batches:
            input_ids, seq_lens = batchify_fn(batch)
            self.input_handles[0].copy_from_cpu(input_ids)
            if network in [
                    "lstm", "bilstm", "gru", "bigru", "rnn", "birnn",
                    "bilstm_attn"
            ]:
                self.input_handles[1].copy_from_cpu(seq_lens)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            probs = softmax(logits, axis=1)
            print(probs)
            idx = np.argmax(probs, axis=1)
            idx = idx.tolist()
            labels = [label_map[i] for i in idx]
            results.extend(labels)
        return results
Example 25
def do_eval(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    dev_ds = load_dataset('clue', args.task_name, splits='dev')

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         label_list=dev_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
    ): fn(samples)

    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if dev_ds.label_list == None else len(dev_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    metric = metric_class()
    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    model.eval()
    metric.reset()
    for batch in dev_data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print("acc: %s\n, " % (res), end='')
Example 26
def collect_data(samples, dataset, config):
    stack_fn = Stack(dtype="int64" if dataset.label_list else "float32")
    stack_fn1 = Stack()

    num_fields = len(samples[0])
    out = [None] * num_fields
    out[0] = stack_fn1([x[0] for x in samples])  # input_ids
    out[1] = stack_fn1([x[1] for x in samples])  # token_type_ids
    if num_fields >= 3:
        out[2] = stack_fn([x[2] for x in samples])  # labels (only present when the dataset provides them)
    seq_len = len(out[0][0])
    # Construct the random attention mask for the random attention
    rand_mask_idx_list = create_bigbird_rand_mask_idx_list(
        config["num_layers"], seq_len, seq_len, config["nhead"],
        config["block_size"], config["window_size"],
        config["num_global_blocks"], config["num_rand_blocks"], config["seed"])
    out.extend(rand_mask_idx_list)
    return out
Example 27
    def build_dataset(index, name, num_samples):
        dataset = GPTDataset(file_prefix=input_prefix,
                             build_data_file=local_rank == 0,
                             micro_batch_size=args.micro_batch_size,
                             name="gpt_" + name,
                             max_seq_len=max_seq_len,
                             num_samples=num_samples,
                             documents=np.arange(splits[index],
                                                 splits[index + 1]),
                             sample_ids=sample_ids,
                             sample_lens=sample_lens,
                             eos_id=eos_id,
                             seed=args.seed,
                             use_pure_fp16=args.use_amp
                             and args.amp_level == "O2")
        batch_sampler = DistributedBatchSampler(
            dataset,
            batch_size=args.micro_batch_size,
            num_replicas=data_world_size,
            rank=data_world_rank,
            shuffle=False,
            drop_last=True)

        if pipeline_mode:

            def data_gen():
                for data in dataset:
                    yield tuple(
                        [np.expand_dims(np.array(x), axis=0) for x in data])

            data_loader = paddle.fluid.io.DataLoader.from_generator(
                feed_list=data_holders, capacity=70, iterable=False)
            data_loader.set_batch_generator(data_gen, places)
        else:
            data_loader = DataLoader(dataset=dataset,
                                     places=places,
                                     feed_list=data_holders,
                                     batch_sampler=batch_sampler,
                                     num_workers=1,
                                     worker_init_fn=worker_init,
                                     collate_fn=Tuple(Stack(), Stack(),
                                                      Stack(), Stack()),
                                     return_list=False)
        return data_loader
Example 28
def predict(model, data, label_map, batch_size=1, pad_token_id=0):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object.
            A Example object contains `text`(word_ids) and `seq_len`(sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The number of batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        results(obj:`dict`): All the predictions labels.
    """

    # Seperates data into some batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    results = []
    model.eval()
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)
        logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
Example 29
def create_eval_dataset(args):
    val_dataloader = None
    eval_batch_size = args.batch_size
    seq_len = args.seq_length

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    pad_token = tokenizer.command_name_map["pad"].Id

    if not args.cloze_eval:
        with open(args.eval_path, "rb") as reader:
            entire_data = reader.read().decode('utf-8')
        num_original_tokens = len(entire_data.strip().split(" "))
        entire_data = wikitext_detokenizer(entire_data)
        tokenized_data = tokenizer.encode(entire_data)
        num_tokenized_tokens = len(tokenized_data)
        print('Original Tokens: %d, Detokenized tokens: %d' %
              (num_original_tokens, num_tokenized_tokens))
        val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, pad_token,
                                      args.overlapping_eval)
    else:
        tokenized_data = []
        tokenized_label = []
        with open(args.eval_path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = get_tokens(tokenizer, text)
                tokenized_data.append(tokens)
                tokenized_label.append(labels)
        val_dataset = Lambada_Eval_Dataset(tokenized_data, tokenized_label,
                                           seq_len, pad_token)
        num_tokenized_tokens = 0
        num_original_tokens = 0

    args.num_examples = len(val_dataset)
    args.num_original_tokens = num_original_tokens
    args.num_tokenized_tokens = num_tokenized_tokens
    val_dataloader = DataLoader(val_dataset,
                                batch_size=eval_batch_size,
                                drop_last=False,
                                collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                 Stack(), Stack()))

    return val_dataloader
Example 30
    def batchify_fn(
            samples,
            fn=Tuple(
                Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
                Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
                Stack(dtype="float32"))):  # label
        new_samples = []
        for sample in samples:
            new_samples.extend(sample)
        return fn(new_samples)
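This last batchify_fn differs from the earlier ones in that each sample is itself a list of sub-examples (for instance, one query paired with several candidates), so everything is flattened into a single batch before Tuple(...) is applied. A hedged sketch with toy fields in the same (input, segment, label) layout:

from paddlenlp.data import Pad, Stack, Tuple

samples = [
    [([1, 2, 3], [0, 0, 0], 1.0), ([1, 2], [0, 0], 0.0)],  # two sub-examples for sample 0
    [([4, 5], [0, 1], 1.0)],                               # one sub-example for sample 1
]
flat = [sub for sample in samples for sub in sample]        # three rows go into the collator instead of two
fn = Tuple(Pad(axis=0, pad_val=0), Pad(axis=0, pad_val=0), Stack(dtype="float32"))
input_ids, segment_ids, labels = fn(flat)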