def create_dataloader(dataset, trans_fn=None, mode='train', batch_size=1, pad_token_id=0):
    """
    Creates a dataloader.

    Args:
        dataset(obj:`paddle.io.Dataset`): Dataset instance.
        trans_fn(obj:`callable`, optional, defaults to `None`): The function used to transform examples.
        mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly.
        batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn, lazy=True)

    shuffle = True if mode == 'train' else False
    sampler = paddle.io.BatchSampler(dataset=dataset,
                                     batch_size=batch_size,
                                     shuffle=shuffle)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # input_ids
        Stack(dtype="int32"),  # seq_len
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    dataloader = paddle.io.DataLoader(dataset,
                                      batch_sampler=sampler,
                                      return_list=True,
                                      collate_fn=batchify_fn)
    return dataloader
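# A minimal, self-contained sketch (not part of the original code) of the
# collation the lambda above performs, using toy (input_ids, seq_len, label)
# samples; all names and values here are illustrative only.
import numpy as np
from paddlenlp.data import Pad, Stack, Tuple

toy_samples = [([2, 3, 4], 3, 1), ([2, 3], 2, 0)]
toy_collate = Tuple(
    Pad(axis=0, pad_val=0),  # input_ids: pad to the longest sample in batch
    Stack(dtype="int32"),    # seq_len: stack scalars into a 1-D array
    Stack(dtype="int64"),    # label
)
input_ids, seq_len, labels = toy_collate(toy_samples)
assert np.array_equal(input_ids, np.array([[2, 3, 4], [2, 3, 0]]))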
def prepare_train_input(insts, bos_idx, eos_idx, pad_idx, pad_seq=1, dtype="int64"):
    """
    Put all padded data needed by training into a list.
    """
    word_pad = Pad(pad_idx, dtype=dtype)
    # Round the max lengths up to a multiple of pad_seq, always leaving at
    # least one slot for the appended eos/bos token.
    src_max_len = (max([len(inst[0]) for inst in insts]) + pad_seq) // pad_seq * pad_seq
    trg_max_len = (max([len(inst[1]) for inst in insts]) + pad_seq) // pad_seq * pad_seq
    src_word = word_pad([
        inst[0] + [eos_idx] + [pad_idx] * (src_max_len - 1 - len(inst[0]))
        for inst in insts
    ])
    trg_word = word_pad([
        [bos_idx] + inst[1] + [pad_idx] * (trg_max_len - 1 - len(inst[1]))
        for inst in insts
    ])
    lbl_word = np.expand_dims(
        word_pad([
            inst[1] + [eos_idx] + [pad_idx] * (trg_max_len - 1 - len(inst[1]))
            for inst in insts
        ]),
        axis=2)

    data_inputs = [src_word, trg_word, lbl_word]
    return data_inputs
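# A quick worked check (pure arithmetic, illustrative only) of the rounding
# used above: `(len + pad_seq) // pad_seq * pad_seq` always rounds *past* the
# current maximum, so even when the longest source is already a multiple of
# pad_seq there is still a free slot for the appended eos token.
pad_seq = 8
assert (13 + pad_seq) // pad_seq * pad_seq == 16  # 13 tokens + eos + 2 pads
assert (16 + pad_seq) // pad_seq * pad_seq == 24  # eos still fits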
def init_lstm_var(args):
    vocab = Vocab.load_vocabulary(args.vocab_path,
                                  unk_token='[UNK]',
                                  pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../../punctuations')
    padding_idx = vocab.token_to_idx.get('[PAD]', 0)
    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init attention layer
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = Senti_data().read(args.data_dir)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq_len
    ): [data for data in fn(samples)]
    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)
    return model, tokenizer, dev_loader
def __init__(self, args={}):
    super(TransformerReader, self).__init__()
    dataset = load_dataset('wmt14ende', splits=('test'))
    if not args.benchmark:
        self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    self.src_vocab = self.trg_vocab = self.vocab

    def convert_samples(samples):
        source = []
        for sample in samples:
            src = sample.split()
            source.append(self.src_vocab.to_indices(src))
        return source

    self.tokenize = convert_samples
    self.to_tokens = self.trg_vocab.to_tokens
    self.feed_keys = ["src_word"]
    self.bos_idx = args.bos_idx
    self.eos_idx = args.eos_idx
    self.pad_idx = args.bos_idx
    self.pad_seq = args.pad_seq
    self.word_pad = Pad(self.pad_idx)
def predict(self, data, tokenizer, batch_size=1, threshold=0.5):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(Example)`): The processed data whose each element is an Example
            (namedtuple) object. An Example object contains `text`(word_ids) and
            `seq_len`(sequence length).
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which contains most of
            the methods. Users should refer to the superclass for more information
            regarding methods.
        batch_size(obj:`int`, defaults to 1): The batch size.
        threshold(obj:`float`, defaults to 0.5): The threshold for converting
            probabilities to labels.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    examples = []
    for text in data:
        example = {"text": text}
        input_ids, segment_ids = convert_example(
            example,
            tokenizer,
            max_seq_length=self.max_seq_length,
            is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    # Separate data into batches.
    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    results = []
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = paddle.to_tensor(self.output_handle.copy_to_cpu())
        probs = F.sigmoid(logits)
        preds = (probs.numpy() > threshold).astype(int)
        results.extend(preds)
    return results
def predict_cls(args, ext_results):
    # Load the label dict
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64")
    ): fn(samples)

    # Keep shuffle off for prediction
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # Load the classification model
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()

    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)
        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results
def infer(args):
    paddle.set_device(args.device)

    # Create dataset.
    infer_dataset = LacDataset(args.data_dir, mode='infer')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
    ): fn(samples)

    # Create sampler for dataloader
    infer_sampler = paddle.io.BatchSampler(dataset=infer_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           drop_last=False)
    infer_loader = paddle.io.DataLoader(dataset=infer_dataset,
                                        batch_sampler=infer_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    # Define the model network
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       infer_dataset.vocab_size, infer_dataset.num_labels)
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    model.prepare()

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    emissions, lengths, crf_decodes = model.predict(
        test_data=infer_loader, batch_size=args.batch_size)

    # Post-processing the lexical analysis results
    lengths = np.array([l for lens in lengths for l in lens]).reshape([-1])
    preds = np.array(
        [pred for batch_pred in crf_decodes for pred in batch_pred])

    results = parse_lac_result(infer_dataset.word_ids, preds, lengths,
                               infer_dataset.word_vocab,
                               infer_dataset.label_vocab)

    sent_tags = []
    for sent, tags in results:
        sent_tag = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        sent_tags.append(''.join(sent_tag))

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(sent_tags[:10]))
def _batchify(self, data: List[List[str]], max_seq_len: int, batch_size: int):
    """
    Generate input batches.
    """
    padding = False if batch_size == 1 else True
    pad_func = Pad(pad_val=self.tokenizer.pad_token_id, pad_right=False)

    def pad_mask(batch_attention_mask):
        """Pad per-example attention masks to the longest sequence in the batch."""
        batch_size = len(batch_attention_mask)
        max_len = max(map(len, batch_attention_mask))
        attention_mask = np.ones(
            (batch_size, max_len, max_len), dtype='float32') * -1e9
        for i, mask_data in enumerate(attention_mask):
            seq_len = len(batch_attention_mask[i])
            mask_data[-seq_len:, -seq_len:] = np.array(
                batch_attention_mask[i], dtype='float32')
        # In order to ensure the correct broadcasting mechanism, expand one
        # dimension to the second dimension (n_head of Transformer).
        attention_mask = np.expand_dims(attention_mask, axis=1)
        return attention_mask

    def _parse_batch(batch_examples):
        if padding:
            input_ids = pad_func(
                [example['input_ids'] for example in batch_examples])
            token_type_ids = pad_func(
                [example['token_type_ids'] for example in batch_examples])
            position_ids = pad_func(
                [example['position_ids'] for example in batch_examples])
            attention_mask = pad_mask(
                [example['attention_mask'] for example in batch_examples])
        else:
            input_ids = np.asarray(
                [example['input_ids'] for example in batch_examples])
            token_type_ids = np.asarray(
                [example['token_type_ids'] for example in batch_examples])
            position_ids = np.asarray(
                [example['position_ids'] for example in batch_examples])
            attention_mask = np.asarray(
                [example['attention_mask'] for example in batch_examples])
            attention_mask = np.expand_dims(attention_mask, 0)
        return input_ids, token_type_ids, position_ids, attention_mask

    examples = []
    for texts in data:
        examples.append(self._convert_text_to_input(texts, max_seq_len))

    # Separate data into batches.
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            yield _parse_batch(one_batch)
            one_batch = []
    if one_batch:
        yield _parse_batch(one_batch)
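# Standalone illustration (replicated here because pad_mask is a closure) of
# the additive-mask convention used above. The lower-triangular masks below
# are hypothetical stand-ins for example['attention_mask'].
import numpy as np

toy_masks = [np.tril(np.ones((2, 2), "float32")),
             np.tril(np.ones((3, 3), "float32"))]
max_len = max(len(m) for m in toy_masks)
batch_mask = np.ones((len(toy_masks), max_len, max_len), dtype="float32") * -1e9
for i, m in enumerate(toy_masks):
    # pad_right=False above left-pads the ids, so real tokens occupy the end
    # of each row and the valid block goes in the bottom-right corner; the
    # padded region keeps -1e9, which softmax turns into ~0 attention.
    batch_mask[i, -len(m):, -len(m):] = m
batch_mask = np.expand_dims(batch_mask, axis=1)  # broadcast over n_head
assert batch_mask.shape == (2, 1, 3, 3)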
def prepare_train_input(insts, bos_id, eos_id, pad_id):
    # Add the bos and eos token ids.
    src = [[bos_id] + inst + [eos_id] for inst in insts]
    # Teacher forcing: the decoder input drops the final token and the label
    # drops the leading bos.
    trg = [inst[:-1] for inst in src]
    label = [inst[1:] for inst in src]
    # Pad sequences using the pad id.
    src, src_length = Pad(pad_val=pad_id, ret_length=True, dtype="int64")(src)
    trg, trg_length = Pad(pad_val=pad_id, ret_length=True, dtype="int64")(trg)
    label, _ = Pad(pad_val=pad_id, ret_length=True, dtype="int64")(label)
    label = np.expand_dims(label, axis=2)
    return src, src_length, trg, trg_length, label
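# A toy trace of the teacher-forcing layout above, with hypothetical ids
# bos=1, eos=2, pad=0:
src, src_length, trg, trg_length, label = prepare_train_input(
    [[5, 6, 7]], bos_id=1, eos_id=2, pad_id=0)
# src   == [[1, 5, 6, 7, 2]]        bos + tokens + eos
# trg   == [[1, 5, 6, 7]]           decoder input, final token dropped
# label == [[[5], [6], [7], [2]]]   shifted left by one, shape (1, 4, 1)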
def predict(self, data, tokenizer):
    """
    Computes embeddings for the data.

    Args:
        data (obj:`List(str)`): The batch data whose each element is a raw text.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which contains most of
            the methods. Users should refer to the superclass for more information
            regarding methods.

    Returns:
        None. The computed embeddings are saved to `corpus_embedding.npy`.
    """
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    all_embeddings = []
    examples = []
    for idx, text in enumerate(tqdm(data)):
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            max_seq_length=self.max_seq_length,
            pad_to_max_seq_len=True)
        examples.append((input_ids, segment_ids))
        # Run the predictor once a chunk of 100 examples is buffered.
        if len(examples) >= 100:
            input_ids, segment_ids = batchify_fn(examples)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            all_embeddings.append(logits)
            examples = []

    # Flush the remaining examples.
    if len(examples) > 0:
        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        all_embeddings.append(logits)

    all_embeddings = np.concatenate(all_embeddings, axis=0)
    np.save('corpus_embedding', all_embeddings)
def prepare_infer_input(insts, bos_idx, eos_idx, pad_idx):
    """
    Put all padded data needed by beam search decoder into a list.
    """
    word_pad = Pad(pad_idx)
    src_word = word_pad([inst[0] + [eos_idx] for inst in insts])
    return [src_word, ]
def predict(self, data, tokenizer):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(str)`): The batch data whose each element is a raw text.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which contains most of
            the methods. Users should refer to the superclass for more information
            regarding methods.

    Returns:
        probs(obj:`numpy.ndarray`): The predicted probabilities over classes.
    """
    if args.benchmark:
        self.autolog.times.start()

    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(text, tokenizer, is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    if args.benchmark:
        self.autolog.times.stamp()

    input_ids, segment_ids = batchify_fn(examples)
    self.input_handles[0].copy_from_cpu(input_ids)
    self.input_handles[1].copy_from_cpu(segment_ids)
    self.predictor.run()
    logits = self.output_handle.copy_to_cpu()

    if args.benchmark:
        self.autolog.times.stamp()

    probs = softmax(logits, axis=1)
    idx = np.argmax(probs, axis=1)
    idx = idx.tolist()

    if args.benchmark:
        self.autolog.times.end(stamp=True)
    return probs
def build_data_loader(args, tokenizer):
    """ Build the corpus_data_loader and the text_data_loader. """
    id2corpus = gen_id2corpus(args.corpus_file)
    # convert_example's input must be a dict
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
    ): [data for data in fn(samples)]
    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # Build the text data_loader
    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
    text_ds = MapDataset(text_list)
    text_data_loader = create_dataloader(text_ds,
                                         mode='predict',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    d = {
        "text_data_loader": text_data_loader,
        "corpus_data_loader": corpus_data_loader,
        "id2corpus": id2corpus,
        "text2similar_text": text2similar_text,
        "text_list": text_list
    }
    return d
def create_test_dataloader(args):
    '''
    Build the dataloader for testing: create the dataset, tokenizer and dataloader.
    input:
        args: arguments provided by the config file
    return:
        test_data_loader
    '''
    no_entity_id = 0

    # Load the dataset
    test_ds = load_dataset('TEDTalk', splits=('test'), lazy=False)

    # Build the dataloader
    model_name_or_path = args.model_name_or_path
    tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),  # seq_len
        'labels': Pad(axis=0, pad_val=args.ignore_label, dtype='int64')  # label
    }): fn(samples)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)
    return test_data_loader
def predict(self, data, tokenizer, label_map, batch_size=1, network="bilstm"):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(Example)`): The processed data whose each element is an Example
            (namedtuple) object. An Example object contains `text`(word_ids) and
            `seq_len`(sequence length).
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which contains most of
            the methods. Users should refer to the superclass for more information
            regarding methods.
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The batch size.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    examples = []
    for text in data:
        input_id, seq_len = preprocess_prediction_data(text, tokenizer)
        examples.append((input_id, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab.token_to_idx.get("[PAD]", 0)),  # input_id
        Stack()  # seq_len
    ): fn(samples)

    # Separate data into batches.
    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    results = []
    for batch in batches:
        input_ids, seq_lens = batchify_fn(batch)
        self.input_handles[0].copy_from_cpu(input_ids)
        # Sequence lengths are only needed by recurrent networks.
        if network in [
                "lstm", "bilstm", "gru", "bigru", "rnn", "birnn",
                "bilstm_attn"
        ]:
            self.input_handles[1].copy_from_cpu(seq_lens)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        probs = softmax(logits, axis=1)
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
def pad_sequence_paddle(inputs, lens, pad_index=0):
    """Split the flat `inputs` into chunks of the given `lens`, pad them to
    the longest chunk, and return the result as a paddle Tensor."""
    sequences = []
    idx = 0
    for l in lens:
        sequences.append(inputs[idx:idx + l])
        idx += l
    outputs = Pad(pad_val=pad_index)(sequences)
    output_tensor = paddle.to_tensor(outputs)
    return output_tensor
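# Usage sketch (illustrative only): three "sentences" of lengths 3, 2 and 4
# stored in one flat list of token ids.
flat_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
padded = pad_sequence_paddle(flat_ids, lens=[3, 2, 4], pad_index=0)
# padded.shape == [3, 4]; the second row is [4, 5, 0, 0]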
def predict(model, data, label_map, batch_size=1, pad_token_id=0):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data whose each element is an Example
            (namedtuple) object. An Example object contains `text`(word_ids) and
            `seq_len`(sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The batch size.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    # Separate data into batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    results = []
    model.eval()
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)

        logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
def do_predict(data,
               model,
               tokenizer,
               viterbi_decoder,
               tags_to_idx,
               idx_to_tags,
               batch_size=1,
               summary_num=2):
    examples = []
    for text in data:
        example = {"tokens": list(text)}
        input_ids, token_type_ids, seq_len = convert_example(
            example, tokenizer, args.max_seq_len, is_test=True)
        examples.append((input_ids, token_type_ids, seq_len))

    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    ): fn(samples)

    all_pred_tags = []
    model.eval()
    for batch in batches:
        input_ids, token_type_ids, seq_len = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        seq_len = paddle.to_tensor(seq_len)
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())
    results = decode(data, all_pred_tags, summary_num, idx_to_tags)
    return results
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
    print(dev_ds[0])

    pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
    tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')

    trans_func_eval = partial(convert_example,
                              tokenizer=tokenizer,
                              max_seq_length=args.max_seq_length,
                              phase="eval")

    batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn_eval,
                                        trans_fn=trans_func_eval)

    model = PairwiseMatching(pretrained_model, margin=args.margin)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    metric = paddle.metric.Auc()
    evaluate(model, metric, dev_data_loader, "dev")
def preprocess(self, input_dicts, data_id, log_id):
    from paddlenlp.data import Stack, Tuple, Pad
    (_, input_dict), = input_dicts.items()
    print("input dict", input_dict)
    batch_size = len(input_dict.keys())
    examples = []
    for i in range(batch_size):
        input_ids, segment_ids = convert_example([input_dict[str(i)]],
                                                 self.tokenizer)
        examples.append((input_ids, segment_ids))
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # segment
    ): fn(samples)
    input_ids, segment_ids = batchify_fn(examples)
    feed_dict = {}
    feed_dict['input_ids'] = input_ids
    feed_dict['token_type_ids'] = segment_ids
    return feed_dict, False, None, ""
def prepare_train_input(insts, pad_idx):
    """
    Put all padded data needed by training into a list.
    """
    word_pad = Pad(pad_idx)
    src_word = word_pad([inst[0] for inst in insts])
    trg_word = word_pad([inst[1][:-1] for inst in insts])
    lbl_word = word_pad([inst[1][1:] for inst in insts])
    data_inputs = [src_word, trg_word, lbl_word]
    return data_inputs
def defaut_collator(tokenizer, args):
    """
    Default collator for sequence classification.

    Args:
        tokenizer (PretrainedTokenizer): tokenizer of a PretrainedModel.
        args: data arguments; the label list is needed.

    Returns:
        batchify_fn (function): collator
    """
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
        "labels": Stack(dtype="int64" if args.label_list else "float32")  # labels
    }): fn(samples)

    return batchify_fn
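# A minimal sketch of how this Dict-based collator consumes dict-shaped
# samples (toy ids; `tokenizer` and `args` are assumed to be the arguments
# passed to the function above):
toy_samples = [
    {"input_ids": [101, 7, 8, 102], "token_type_ids": [0, 0, 0, 0], "labels": 1},
    {"input_ids": [101, 9, 102], "token_type_ids": [0, 0, 0], "labels": 0},
]
collate = defaut_collator(tokenizer, args)
input_ids, token_type_ids, labels = collate(toy_samples)
# Dict batches the fields in key order: input_ids and token_type_ids are
# padded to length 4, labels is stacked into shape (2,).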
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # Create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       test_dataset.vocab_size, test_dataset.num_labels)
    # The batched word ids and lengths are int64 arrays, so the input specs
    # must declare int64 as well.
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    chunk_evaluator = ChunkEvaluator(
        label_list=test_dataset.label_vocab.keys(), suffix=True)
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    model.evaluate(eval_data=test_loader,
                   batch_size=args.batch_size,
                   log_freq=100,
                   verbose=2)
def predict(self, data, word_vocab, label_vocab, normlize_vocab, batch_size=1):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(Example)`): The processed data whose each element is an Example
            (namedtuple) object. An Example object contains `text`(word_ids) and
            `seq_len`(sequence length).
        word_vocab(obj:`dict`): The word id (key) to word str (value) map.
        label_vocab(obj:`dict`): The label id (key) to label str (value) map.
        normlize_vocab(obj:`dict`): The fullwidth char (key) to halfwidth char (value) map.
        batch_size(obj:`int`, defaults to 1): The batch size.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    examples = []
    for text in data:
        tokens = list(text.strip())
        token_ids, length = convert_example(tokens,
                                            self.max_seq_length,
                                            word_vocab=word_vocab,
                                            normlize_vocab=normlize_vocab)
        examples.append((token_ids, length))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # input
        Stack(axis=0),  # length
    ): fn(samples)

    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    results = []
    for batch in batches:
        token_ids, length = batchify_fn(batch)
        self.input_handles[0].copy_from_cpu(token_ids)
        self.input_handles[1].copy_from_cpu(length)
        self.predictor.run()
        preds = self.output_handle.copy_to_cpu()
        result = parse_result(token_ids, preds, length, word_vocab,
                              label_vocab)
        results.extend(result)
    return results
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # Create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=True)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       test_dataset.vocab_size, test_dataset.num_labels)
    model = paddle.Model(network)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((test_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for SOS and EOS
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    model.evaluate(eval_data=test_loader,
                   batch_size=args.batch_size,
                   log_freq=100,
                   verbose=2)
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only supports QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    dataset_class = TASK_CLASSES[task_name]
    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None,
    )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.apply(trans_func, lazy=True)
    dev_ds = dev_ds.apply(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Stack(dtype="int64" if train_ds.get_labels() else "float32")  # label
    ): [data for data in fn(samples)]

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
def get_train_dataloader(tokenizer, args):
    splits = "train"
    data_dir = args.data_dir
    filename = os.path.join(data_dir, "cmrc2018_" + splits + ".pkl")
    if os.path.exists(filename):
        ds = load_pickle(filename)
    else:
        ds = load_dataset("cmrc2018", splits=splits)
        ds.map(partial(prepare_train_features_paddlenlp,
                       tokenizer=tokenizer,
                       args=args),
               batched=True,
               lazy=False)
        save_pickle(ds, filename)

    batch_sampler = BatchSampler(ds,
                                 batch_size=args.train_batch_size,
                                 shuffle=True)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=0),
        "pinyin_ids": Pad(axis=0, pad_val=0),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64"),
    }): fn(samples)

    data_loader = DataLoader(dataset=ds,
                             batch_sampler=batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=args.num_workers,
                             return_list=True)
    return data_loader
def get_mnli_dev_dataloader(tokenizer, args, matched=True):
    if matched:
        split = "dev_matched"
    else:
        split = "dev_mismatched"
    filename = os.path.join("caches", args.task_name + f"_{split}" + ".pkl")
    if os.path.exists(filename):
        ds = load_pickle(filename)
    else:
        ds = load_dataset("glue", args.task_name, splits=split)
        ds.map(partial(trans_func, tokenizer=tokenizer, args=args),
               batched=False,
               lazy=False)
        save_pickle(ds, filename)

    batch_sampler = BatchSampler(ds,
                                 batch_size=args.train_batch_size,
                                 shuffle=False)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # attention_mask
        Pad(axis=0, pad_val=-100, dtype="int64"),  # lm_labels
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # decoder_attention_mask
    ): fn(samples)

    data_loader = DataLoader(dataset=ds,
                             batch_sampler=batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=args.num_workers,
                             return_list=True)
    return data_loader
def preprocess_fn(data):
    examples = []
    if not isinstance(data, list):
        data = [data]
    for text in data:
        input_ids, segment_ids = convert_example(text,
                                                 tokenizer,
                                                 max_seq_length=128,
                                                 is_test=True)
        examples.append((input_ids, segment_ids))
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
    ): fn(samples)
    input_ids, segment_ids = batchify_fn(examples)
    return paddle.to_tensor(input_ids, stop_gradient=False), paddle.to_tensor(
        segment_ids, stop_gradient=False)
def prepare_train_input(insts, bos_idx, eos_idx, pad_idx):
    """
    Put all padded data needed by training into a list.
    """
    word_pad = Pad(pad_idx)
    src_word = word_pad([inst[0] + [eos_idx] for inst in insts])
    trg_word = word_pad([[bos_idx] + inst[1] for inst in insts])
    lbl_word = np.expand_dims(
        word_pad([inst[1] + [eos_idx] for inst in insts]), axis=2)
    data_inputs = [src_word, trg_word, lbl_word]
    return data_inputs