def init_lstm_var(args):
    if args.language == 'ch':
        vocab = Vocab.load_vocabulary("../task/similarity/simnet/vocab.char",
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
    else:
        vocab = Vocab.load_vocabulary("../task/similarity/simnet/vocab_QQP",
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../punctuations')
    model = SimNet(network='lstm', vocab_size=len(vocab), num_classes=2)

    dev_ds = SimilarityData().read(os.path.join(args.data_dir, 'dev'))
    dev_examples = preprocess_data(dev_ds.data, tokenizer, language=args.language)
    batches = [
        dev_examples[idx:idx + args.batch_size]
        for idx in range(0, len(dev_examples), args.batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # query_ids
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    return model, tokenizer, batches, batchify_fn, vocab, dev_ds
def build_dataset(index, name, num_samples):
    dataset = GPTDataset(file_path=input_path,
                         build_data_file=local_rank == 0,
                         name="gpt_" + name,
                         max_seq_len=max_seq_len,
                         num_samples=num_samples,
                         documents=np.arange(splits[index], splits[index + 1]),
                         sample_ids=sample_ids,
                         sample_lens=sample_lens,
                         eos_id=eos_id,
                         seed=args.seed)
    batch_sampler = DistributedBatchSampler(dataset,
                                            batch_size=args.local_batch_size,
                                            num_replicas=data_world_size,
                                            rank=data_world_rank,
                                            shuffle=False,
                                            drop_last=True)
    data_loader = DataLoader(
        dataset=dataset,
        places=places,
        feed_list=data_holders,
        batch_sampler=batch_sampler,
        num_workers=1,
        worker_init_fn=worker_init,
        # collate_fn=Tuple(Stack(), Stack(), Stack(), Stack(), Stack()),
        collate_fn=Tuple(Stack(), Stack(), Stack()),
        return_list=False)
    return data_loader
def create_dataloader(dataset,
                      trans_fn=None,
                      mode='train',
                      batch_size=1,
                      pad_token_id=0):
    """
    Creates a dataloader.

    Args:
        dataset(obj:`paddle.io.Dataset`): Dataset instance.
        trans_fn(obj:`callable`, optional, defaults to `None`): The function that converts raw examples to model inputs.
        mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly.
        batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn, lazy=True)

    shuffle = True if mode == 'train' else False
    sampler = paddle.io.BatchSampler(dataset=dataset,
                                     batch_size=batch_size,
                                     shuffle=shuffle)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab.get('[PAD]', 0)),  # input_ids
        Stack(dtype="int32"),  # seq len
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    dataloader = paddle.io.DataLoader(dataset,
                                      batch_sampler=sampler,
                                      return_list=True,
                                      collate_fn=batchify_fn)
    return dataloader
def _test_impl(self, list_fn=True):
    if list_fn:
        batchify_fn = Tuple([Stack(), Pad(axis=0, pad_val=0)])
    else:
        batchify_fn = Tuple(Stack(), Pad(axis=0, pad_val=0))
    result = batchify_fn(self.input)
    self.check_output_equal(result[0], self.expected_result[0])
    self.check_output_equal(result[1], self.expected_result[1])
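# --- Hedged usage sketch (editor's addition, not part of the test above). It
# shows what a Tuple(Stack(), Pad(...)) collator, as exercised by _test_impl,
# produces on a toy batch; the sample values are illustrative assumptions.
#
# from paddlenlp.data import Pad, Stack, Tuple
#
# toy_batch = [(0, [1, 2, 3]),   # (label, token_ids)
#              (1, [4, 5])]
# labels, token_ids = Tuple(Stack(), Pad(axis=0, pad_val=0))(toy_batch)
# # labels -> array([0, 1]); token_ids -> [[1, 2, 3], [4, 5, 0]] (padded to the longest sequence)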
def qa_collator(tokenizer, args):
    train_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64")
    }): fn(samples)

    return train_batchify_fn
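# --- Hedged usage sketch for qa_collator (editor's addition; the stand-in
# tokenizer and the toy samples below are assumptions, not from the source).
# The Dict-based collator expects each sample to be a dict with these keys and
# returns padded input_ids/token_type_ids plus stacked start/end positions.
#
# from types import SimpleNamespace
#
# _tok = SimpleNamespace(pad_token_id=0, pad_token_type_id=0)
# _collate = qa_collator(_tok, args=None)  # `args` is not used inside the collator
# input_ids, token_type_ids, starts, ends = _collate([
#     {"input_ids": [1, 2, 3], "token_type_ids": [0, 0, 0],
#      "start_positions": 1, "end_positions": 2},
#     {"input_ids": [4, 5], "token_type_ids": [0, 0],
#      "start_positions": 0, "end_positions": 1},
# ])
# # input_ids is padded to shape (2, 3); starts/ends are int64 arrays of shape (2,)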
def interpret(model, data, label_map, batch_size=1, pad_token_id=0, vocab=None):
    """
    Predicts the data labels and collects attention weights for interpretation.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data whose each element is an Example (namedtuple) object.
            An Example object contains `text`(word_ids) and `seq_len`(sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The batch size.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.
        vocab(obj:`Vocab`, optional): The vocabulary used to map ids back to tokens.

    Returns:
        results(obj:`list`): The attention weights and tokens for each example.
    """
    # Separates data into some batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    model.eval()
    results = []
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)
        logits, attention, _ = model.forward_interpret(
            query_ids, title_ids, query_seq_lens, title_seq_lens)
        query_att = attention[0]
        title_att = attention[1]
        model.clear_gradients()
        for query_id, title_id in zip(query_ids.numpy().tolist(),
                                      title_ids.numpy().tolist()):
            query = [vocab._idx_to_token[idx] for idx in query_id]
            title = [vocab._idx_to_token[idx] for idx in title_id]
            results.append([query_att, query, title_att, title])
        print('query_att: %s' % query_att.shape)
        print('title_att: %s' % title_att.shape)
    return results
def main():
    paddle.seed(42)
    args = parse_args()

    args.task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()

    predictor = Predictor.create_predictor(args)

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    if not args.use_faster_tokenizer:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    else:
        trans_func = partial(convert_example,
                             label_list=dev_ds.label_list,
                             is_test=False)
        dev_ds = dev_ds.map(trans_func, lazy=True)
    if not args.use_faster_tokenizer:
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
        ): fn(samples)
        outputs = predictor.predict(dev_ds, tokenizer, batchify_fn, args)
    else:
        outputs = predictor.faster_predict(dev_ds, args=args)
def init_lstm_var(args):
    vocab = Vocab.load_vocabulary(args.vocab_path,
                                  unk_token='[UNK]',
                                  pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../../punctuations')
    padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init attention layer
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = Senti_data().read(args.data_dir)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    ): [data for data in fn(samples)]
    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
def predict_cls(self, args, ext_results):
    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=self.tokenizer,
                         label2id=self.cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)
    batch_list = [
        test_ds[idx:idx + args.batch_size]
        for idx in range(0, len(test_ds), args.batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64")): fn(samples)

    results = []
    for batch_data in batch_list:
        input_ids, token_type_ids, _ = batchify_fn(batch_data)
        self.cls_input_handles[0].copy_from_cpu(input_ids)
        self.cls_input_handles[1].copy_from_cpu(token_type_ids)
        self.cls_predictor.run()
        logits = self.cls_output_hanle.copy_to_cpu()

        predictions = logits.argmax(axis=1).tolist()
        results.extend(predictions)

    return results
def main():
    args = parse_args()

    predictor = Predictor.create_predictor(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    dataset = dataset_class.get_datasets("test")
    tokenizer = tokenizer_class.from_pretrained(
        os.path.dirname(args.model_path))
    transform_fn = partial(convert_example,
                           tokenizer=tokenizer,
                           label_list=dataset.get_labels(),
                           max_seq_length=args.max_seq_length,
                           is_test=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    dataset = dataset.apply(transform_fn)

    predictor.predict(dataset,
                      batch_size=args.batch_size,
                      collate_fn=batchify_fn)
def _collate_data(data, stack_fn=Stack()):
    num_fields = len(data[0])
    out = [None] * num_fields
    # input_ids, segment_ids, input_mask, masked_lm_positions,
    # masked_lm_labels, next_sentence_labels, mask_token_num
    for i in (0, 1, 2, 5):
        out[i] = stack_fn([x[i] for x in data])
    batch_size, seq_length = out[0].shape
    size = num_mask = sum(len(x[3]) for x in data)
    # Padding for divisibility by 8 for fp16 or int8 usage
    if size % 8 != 0:
        size += 8 - (size % 8)
    # masked_lm_positions
    # Organize as a 1D tensor for gather or use gather_nd
    out[3] = np.full(size, 0, dtype=np.int32)
    # masked_lm_labels
    out[4] = np.full([size, 1], -1, dtype=np.int64)
    mask_token_num = 0
    for i, x in enumerate(data):
        for j, pos in enumerate(x[3]):
            out[3][mask_token_num] = i * seq_length + pos
            out[4][mask_token_num] = x[4][j]
            mask_token_num += 1
    # mask_token_num
    out.append(np.asarray([mask_token_num], dtype=np.float32))
    if args.use_amp and args.use_pure_fp16:
        # cast input_mask to fp16
        out[2] = out[2].astype(np.float16)
        # cast masked_lm_scale to fp16
        out[-1] = out[-1].astype(np.float16)
    return out
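# --- Hedged worked example of the masked-position flattening above (editor's
# addition; the numbers are toy assumptions). With seq_length = 4, if sample 0
# masks positions [1, 3] and sample 1 masks position [2], each flat index is
# i * seq_length + pos:
#   sample 0: 0*4+1 = 1, 0*4+3 = 3
#   sample 1: 1*4+2 = 6
# so out[3][:3] == [1, 3, 6], and a gather over the flattened
# [batch_size * seq_length] hidden states selects exactly the masked tokens.
#
# _seq_length = 4
# _masked = [[1, 3], [2]]
# _flat = [i * _seq_length + p for i, x in enumerate(_masked) for p in x]
# assert _flat == [1, 3, 6]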
def news_title_classification():
    """Text classification: news headline classification."""
    data = json.loads(request.data)
    if isinstance(data["texts"], str):
        data["texts"] = [data["texts"]]
    if isinstance(data["texts"], list):
        datasets = []
        for text in data["texts"]:
            datasets.append({"text": text})
        datasets = MapDataset(datasets)
        trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             max_seq_length=64)
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype="int64")  # label
        ): fn(samples)
        data_loader = create_dataloader(datasets,
                                        mode="test",
                                        batch_size=32,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)
        labels = inference(model, data_loader)
        labels_text = []
        for id, label in enumerate(labels):
            labels_text.append(labels_info[str(label)])
        return jsonify(status="Success", results=labels_text)
    else:
        return jsonify(status="Failure", message="Incorrect parameter data type.")
def _collate_data(data, stack_fn=Stack()):
    # Data Fields: input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
    # masked_lm_weights, next_sentence_labels
    num_fields = len(data[0])
    out = [None] * num_fields
    for i in [0, 1, 5]:
        out[i] = stack_fn([x[i] for x in data])
    batch_size, seq_length = out[0].shape
    size = num_mask = sum(len(x[2]) for x in data)
    # masked_lm_positions
    # Organize as a 1D tensor for gather or use gather_nd
    out[2] = np.full(size, 0, dtype=np.int32)
    # masked_lm_labels
    out[3] = np.full([size, 1], -1, dtype=np.int64)
    # masked_lm_weights
    out[4] = np.full([size], 0, dtype="float32")
    mask_token_num = 0
    for i, x in enumerate(data):
        for j, pos in enumerate(x[2]):
            out[2][mask_token_num] = i * seq_length + pos
            out[3][mask_token_num] = x[3][j]
            out[4][mask_token_num] = x[4][j]
            mask_token_num += 1
    out.append(np.asarray([mask_token_num], dtype=np.float32))
    seq_len = len(out[0][0])
    # Construct the random attention mask for BigBird
    rand_mask_idx_list = create_bigbird_rand_mask_idx_list(
        config["num_layers"], seq_len, seq_len, config["nhead"],
        config["block_size"], config["window_size"],
        config["num_global_blocks"], config["num_rand_blocks"], config["seed"])
    out.extend(rand_mask_idx_list)
    return out
def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)

    model = RobertaForQuestionAnswering.from_pretrained(
        args.from_pretrained, num_classes=args.num_classes)
    map_fn = partial(map_fn_DuCheckList, args=args, tokenizer=tokenizer)
    dev_ds = RCInterpret().read(args.data_dir)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "offset_mapping": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "overflow_to_sample": Stack(dtype='int32'),
    }): fn(samples)
    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)
    return model, tokenizer, dev_dataloader, dev_ds
def _pre_process_text(self, input_texts, max_seq_len=128, batch_size=1):
    infer_data = []
    max_predict_len = max_seq_len - self.summary_num - 1
    short_input_texts = self._split_long_text2short_text_list(
        input_texts, max_predict_len)
    for text in short_input_texts:
        tokens = ["[CLS%i]" % i
                  for i in range(1, self.summary_num)] + list(text)
        tokenized_input = self._tokenizer(tokens,
                                          return_length=True,
                                          is_split_into_words=True,
                                          max_seq_len=max_seq_len)
        infer_data.append([
            tokenized_input['input_ids'], tokenized_input['token_type_ids'],
            tokenized_input['seq_len']
        ])
    infer_ds = MapDataset(infer_data)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self._tokenizer.pad_token_id,
            dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=self._tokenizer.pad_token_type_id,
            dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    ): fn(samples)
    infer_data_loader = paddle.io.DataLoader(infer_ds,
                                             collate_fn=batchify_fn,
                                             num_workers=0,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             return_list=True)
    return infer_data_loader, short_input_texts
def _collate_data(data, stack_fn=Stack()):
    num_fields = len(data[0])
    out = [None] * num_fields
    # 0. input_ids,
    # 1. segment_ids,
    # 2. input_mask,
    # 3. masked_lm_positions,
    # 4. masked_lm_labels,
    # 5. next_sentence_labels
    for i in (0, 1, 2, 5):
        out[i] = stack_fn([x[i] for x in data])
    out[5] = out[5].reshape([-1, 1])
    batch_size, seq_length = out[0].shape
    size = num_mask = sum(len(x[3]) for x in data)
    # masked_lm_positions
    # Organize as a 1D tensor for gather or use gather_nd
    if size % 8 != 0:
        size += 8 - (size % 8)
    out[3] = np.full(size, 0, dtype=np.int32)
    # masked_lm_labels
    out[4] = np.full([size, 1], -1, dtype=np.int64)
    mask_token_num = 0
    for i, x in enumerate(data):
        for j, pos in enumerate(x[3]):
            out[3][mask_token_num] = i * seq_length + pos
            out[4][mask_token_num] = x[4][j]
            mask_token_num += 1
    return out
def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)
    eval_ds = eval_ds.map(trans_func)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
        precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def infer(args):
    paddle.set_device(args.device)

    # create dataset.
    infer_dataset = LacDataset(args.data_dir, mode='infer')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
    ): fn(samples)

    # Create sampler for dataloader
    infer_sampler = paddle.io.BatchSampler(dataset=infer_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           drop_last=False)
    infer_loader = paddle.io.DataLoader(dataset=infer_dataset,
                                        batch_sampler=infer_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    # Define the model network
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       infer_dataset.vocab_size, infer_dataset.num_labels)
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    model.prepare()

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    emissions, lengths, crf_decodes = model.predict(test_data=infer_loader,
                                                    batch_size=args.batch_size)

    # Post-processing the lexical analysis results
    lengths = np.array([l for lens in lengths for l in lens]).reshape([-1])
    preds = np.array(
        [pred for batch_pred in crf_decodes for pred in batch_pred])

    results = parse_lac_result(infer_dataset.word_ids, preds, lengths,
                               infer_dataset.word_vocab,
                               infer_dataset.label_vocab)

    sent_tags = []
    for sent, tags in results:
        sent_tag = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        sent_tags.append(''.join(sent_tag))

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(sent_tags[:10]))
def create_pretrained_dataset(args, input_path, worker_init, worker_index,
                              eod_id):
    train_data = GPT2Dataset(file_path=input_path,
                             worker_index=worker_index,
                             num_samples=args.batch_size * args.max_steps,
                             eod_id=eod_id,
                             seed=args.seed + worker_index)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_data, batch_size=args.batch_size, shuffle=True, drop_last=True)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_sampler=train_batch_sampler,
                                   num_workers=0,
                                   worker_init_fn=worker_init,
                                   collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                    Stack(), Stack()))
    return train_data_loader
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        task_name,
                                        splits=["train", "dev"])
    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']
    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq len
        Stack(dtype="int64")  # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
def do_predict():
    paddle.set_device(args.device)

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))

    no_entity_label = "O"
    ignore_label = len(label_map)

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        sent = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature(
            [list(sent), []],
            tokenizer,
            max_seq_len=args.max_seq_len,
            is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token],
            dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token],
            dtype='int32'),  # token_type_ids
        Stack(dtype='int64')  # sequence lens
    ): fn(samples)
    # Separates data into some batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]
    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids, seq_lens = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(),
                                          seq_lens.tolist()):
            prob_one = [
                p_list[index][pid]
                for index, pid in enumerate(p_ids[1:seq_len - 1])
            ]
            label_one = [id2label[pid] for pid in p_ids[1:seq_len - 1]]
            results.append({"probs": prob_one, "labels": label_one})
    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
def evaluate(args):
    paddle.set_device(args.device)

    # create dataset.
    test_ds = load_dataset(datafiles=(os.path.join(args.data_dir, 'test.tsv')))

    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to convert full-width (SBC) characters to half-width (DBC) characters
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(convert_example,
                         max_seq_len=args.max_seq_len,
                         word_vocab=word_vocab,
                         label_vocab=label_vocab,
                         normlize_vocab=normlize_vocab)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    model = BiGruCrf(args.emb_dim, args.hidden_size, len(word_vocab),
                     len(label_vocab))
    chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(),
                                     suffix=True)

    # Load the model and start predicting
    model_dict = paddle.load(args.init_checkpoint)
    model.load_dict(model_dict)

    model.eval()
    chunk_evaluator.reset()
    for batch in test_loader:
        token_ids, length, labels = batch
        preds = model(token_ids, length)
        num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_evaluator.compute(
            length, preds, labels)
        chunk_evaluator.update(num_infer_chunks.numpy(),
                               num_label_chunks.numpy(),
                               num_correct_chunks.numpy())
    precision, recall, f1_score = chunk_evaluator.accumulate()
    print("eval precision: %f, recall: %f, f1: %f" %
          (precision, recall, f1_score))
def load_squad_dataset(args):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    features_fn = prepare_train_features if args.is_training else prepare_validation_features
    if args.is_training:
        raw_dataset = load_dataset('squad', split='train')
    else:
        raw_dataset = load_dataset('squad', split='validation')
    column_names = raw_dataset.column_names
    dataset = raw_dataset.map(partial(features_fn,
                                      tokenizer=tokenizer,
                                      args=args),
                              batched=True,
                              remove_columns=column_names,
                              num_proc=4)

    bs = args.micro_batch_size * args.grad_acc_factor * args.batches_per_step * args.num_replica
    args.batch_size = bs
    if args.is_training:
        train_batch_sampler = BatchSampler(dataset,
                                           batch_size=bs,
                                           shuffle=args.shuffle,
                                           drop_last=True)
    else:
        train_batch_sampler = BatchSampler(dataset,
                                           batch_size=bs,
                                           shuffle=args.shuffle,
                                           drop_last=False)

    if args.is_training:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack(),
            "start_positions": Stack(),
            "end_positions": Stack()
        }): fn(samples)
    else:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack()
        }): fn(samples)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=train_batch_sampler,
                             collate_fn=collate_fn,
                             return_list=True)
    return raw_dataset, data_loader
def predict(self, data, tokenizer, label_map, batch_size=1, network="bilstm"):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(str)`): The raw texts to be predicted; each text is converted to
            `input_ids`(word_ids) and `seq_len`(sequence length) before batching.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which contains most of the methods.
            Users should refer to the superclass for more information regarding methods.
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The batch size.
        network(obj:`str`, defaults to `bilstm`): The network name of the small model.

    Returns:
        results(obj:`list`): All the predicted labels.
    """
    examples = []
    for text in data:
        input_id, seq_len = preprocess_prediction_data(text, tokenizer)
        examples.append((input_id, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab.token_to_idx.get("[PAD]", 0)
            ),  # input_id
        Stack()  # seq_len
    ): fn(samples)

    # Separates data into some batches.
    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    results = []
    for batch in batches:
        input_ids, seq_lens = batchify_fn(batch)
        self.input_handles[0].copy_from_cpu(input_ids)
        if network in [
                "lstm", "bilstm", "gru", "bigru", "rnn", "birnn",
                "bilstm_attn"
        ]:
            self.input_handles[1].copy_from_cpu(seq_lens)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        probs = softmax(logits, axis=1)
        print(probs)
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
def do_eval(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=dev_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
    ): fn(samples)

    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if dev_ds.label_list == None else len(dev_ds.label_list)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    metric = metric_class()
    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    model.eval()
    metric.reset()
    for batch in dev_data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print("acc: %s\n, " % (res), end='')
def collect_data(samples, dataset, config):
    stack_fn = Stack(dtype="int64" if dataset.label_list else "float32")
    stack_fn1 = Stack()

    num_fields = len(samples[0])
    out = [None] * num_fields
    out[0] = stack_fn1([x[0] for x in samples])  # input_ids
    out[1] = stack_fn1([x[1] for x in samples])  # token_type_ids
    # Labels are only present when each sample carries a third field.
    if num_fields >= 3:
        out[2] = stack_fn([x[2] for x in samples])  # labels
    seq_len = len(out[0][0])
    # Construct the random attention mask for BigBird's random attention
    rand_mask_idx_list = create_bigbird_rand_mask_idx_list(
        config["num_layers"], seq_len, seq_len, config["nhead"],
        config["block_size"], config["window_size"],
        config["num_global_blocks"], config["num_rand_blocks"],
        config["seed"])
    out.extend(rand_mask_idx_list)
    return out
def build_dataset(index, name, num_samples):
    dataset = GPTDataset(file_prefix=input_prefix,
                         build_data_file=local_rank == 0,
                         micro_batch_size=args.micro_batch_size,
                         name="gpt_" + name,
                         max_seq_len=max_seq_len,
                         num_samples=num_samples,
                         documents=np.arange(splits[index], splits[index + 1]),
                         sample_ids=sample_ids,
                         sample_lens=sample_lens,
                         eos_id=eos_id,
                         seed=args.seed,
                         use_pure_fp16=args.use_amp and args.amp_level == "O2")
    batch_sampler = DistributedBatchSampler(dataset,
                                            batch_size=args.micro_batch_size,
                                            num_replicas=data_world_size,
                                            rank=data_world_rank,
                                            shuffle=False,
                                            drop_last=True)

    if pipeline_mode:

        def data_gen():
            for data in dataset:
                yield tuple(
                    [np.expand_dims(np.array(x), axis=0) for x in data])

        data_loader = paddle.fluid.io.DataLoader.from_generator(
            feed_list=data_holders, capacity=70, iterable=False)
        data_loader.set_batch_generator(data_gen, places)
    else:
        data_loader = DataLoader(dataset=dataset,
                                 places=places,
                                 feed_list=data_holders,
                                 batch_sampler=batch_sampler,
                                 num_workers=1,
                                 worker_init_fn=worker_init,
                                 collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                  Stack()),
                                 return_list=False)
    return data_loader
def predict(model, data, label_map, batch_size=1, pad_token_id=0):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data whose each element is an Example (namedtuple) object.
            An Example object contains `text`(word_ids) and `seq_len`(sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The batch size.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        results(obj:`list`): All the predicted labels.
    """
    # Separates data into some batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    results = []
    model.eval()
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)
        logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
def create_eval_dataset(args):
    val_dataloader = None
    eval_batch_size = args.batch_size
    seq_len = args.seq_length

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    pad_token = tokenizer.command_name_map["pad"].Id

    if not args.cloze_eval:
        with open(args.eval_path, "rb") as reader:
            entire_data = reader.read().decode('utf-8')
        num_original_tokens = len(entire_data.strip().split(" "))
        entire_data = wikitext_detokenizer(entire_data)
        tokenized_data = tokenizer.encode(entire_data)
        num_tokenized_tokens = len(tokenized_data)
        print('Original Tokens: %d, Detokenized tokens: %d' %
              (num_original_tokens, num_tokenized_tokens))
        val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, pad_token,
                                      args.overlapping_eval)
    else:
        tokenized_data = []
        tokenized_label = []
        with open(args.eval_path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = get_tokens(tokenizer, text)
                tokenized_data.append(tokens)
                tokenized_label.append(labels)
        val_dataset = Lambada_Eval_Dataset(tokenized_data, tokenized_label,
                                           seq_len, pad_token)
        num_tokenized_tokens = 0
        num_original_tokens = 0

    args.num_examples = len(val_dataset)
    args.num_original_tokens = num_original_tokens
    args.num_tokenized_tokens = num_tokenized_tokens
    val_dataloader = DataLoader(val_dataset,
                                batch_size=eval_batch_size,
                                drop_last=False,
                                collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                 Stack(), Stack()))

    return val_dataloader
def batchify_fn(samples,
                fn=Tuple(
                    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
                    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
                    Stack(dtype="float32"))):  # label
    new_samples = []
    for sample in samples:
        new_samples.extend(sample)
    return fn(new_samples)
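# --- Hedged usage note for batchify_fn above (editor's addition; the toy values
# are assumptions). Each element of `samples` is itself a list of
# (input_ids, segment_ids, label) tuples, so the collator first flattens the
# nested lists and then pads/stacks across every entry in the batch:
#
# batch = batchify_fn([
#     [([1, 2, 3], [0, 0, 0], 1.0), ([1, 2], [0, 0], 0.0)],  # sample A: 2 entries
#     [([4, 5], [0, 0], 1.0)],                                # sample B: 1 entry
# ])
# # -> 3 padded rows of input ids / segment ids and a float32 label array of shape (3,)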