def predict(model, data, tokenizer, label_map, batch_size=1):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data whose each element is
            an Example (namedtuple) object. An Example object contains
            `text`(word_ids) and `seq_len`(sequence length).
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which
            contains most of the methods. Users should refer to the
            superclass for more information regarding methods.
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The number of examples in one
            batch.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            label_list=label_map.values(),
            max_seq_length=args.max_seq_length,
            is_test=True)
        examples.append((input_ids, segment_ids))

    # Separate data into batches.
    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    results = []
    model.eval()
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
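# --- Usage sketch (not part of the original script) ---
# A minimal, hedged example of calling `predict` above. The checkpoint name,
# label_map, texts, and the stub `convert_example`/`args` defined here are
# assumptions for illustration only; real scripts supply their own versions.
from types import SimpleNamespace

import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer

args = SimpleNamespace(max_seq_length=128)  # stand-in for the script's args


def convert_example(text, tokenizer, label_list=None, max_seq_length=128,
                    is_test=True):
    # Stub: tokenize one raw text into (input_ids, token_type_ids).
    encoded = tokenizer(text, max_seq_len=max_seq_length)
    return (np.array(encoded["input_ids"], dtype="int64"),
            np.array(encoded["token_type_ids"], dtype="int64"))


label_map = {0: "negative", 1: "positive"}
model = ErnieForSequenceClassification.from_pretrained(
    "ernie-1.0", num_classes=len(label_map))
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
print(predict(model, ["这个产品很好用", "质量太差了"], tokenizer, label_map,
              batch_size=2))
# e.g. ['positive', 'negative'] (actual labels depend on fine-tuned weights)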
def build_dataset(index, name, num_samples):
    dataset = GPTDataset(file_prefix=input_prefix,
                         build_data_file=local_rank == 0,
                         micro_batch_size=args.micro_batch_size,
                         name="gpt_" + name,
                         max_seq_len=max_seq_len,
                         num_samples=num_samples,
                         documents=np.arange(splits[index], splits[index + 1]),
                         sample_ids=sample_ids,
                         sample_lens=sample_lens,
                         eos_id=eos_id,
                         seed=args.seed,
                         use_pure_fp16=args.use_amp and args.amp_level == "O2",
                         data_holders=data_holders)
    batch_sampler = DistributedBatchSampler(dataset,
                                            batch_size=args.micro_batch_size,
                                            num_replicas=data_world_size,
                                            rank=data_world_rank,
                                            shuffle=False,
                                            drop_last=True)

    if pipeline_mode:

        def data_gen():
            for data in dataset:
                yield tuple(
                    [np.expand_dims(np.array(x), axis=0) for x in data])

        data_loader = paddle.fluid.io.DataLoader.from_generator(
            feed_list=data_holders, capacity=70, iterable=False)
        data_loader.set_sample_generator(data_gen,
                                         batch_size=args.micro_batch_size,
                                         places=places)
    else:
        stacks = (Stack(), ) * len(data_holders)
        collate_fn = Tuple(*stacks)
        data_loader = DataLoader(dataset=dataset,
                                 places=places,
                                 feed_list=data_holders,
                                 batch_sampler=batch_sampler,
                                 num_workers=1,
                                 worker_init_fn=worker_init,
                                 collate_fn=collate_fn,
                                 return_list=False)
    return data_loader
def predict(self, data, tokenizer):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(str)`): The batch data whose each element is a raw
            text.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which
            contains most of the methods. Users should refer to the
            superclass for more information regarding methods.

    Returns:
        probs(obj:`numpy.ndarray`): The probability of each label for each
            input text.
    """
    if args.benchmark:
        self.autolog.times.start()

    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(text,
                                                 tokenizer,
                                                 is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # segment
    ): fn(samples)

    if args.benchmark:
        self.autolog.times.stamp()

    input_ids, segment_ids = batchify_fn(examples)
    self.input_handles[0].copy_from_cpu(input_ids)
    self.input_handles[1].copy_from_cpu(segment_ids)
    self.predictor.run()
    logits = self.output_handle.copy_to_cpu()

    if args.benchmark:
        self.autolog.times.stamp()

    probs = softmax(logits, axis=1)
    idx = np.argmax(probs, axis=1)
    idx = idx.tolist()

    if args.benchmark:
        self.autolog.times.end(stamp=True)

    return probs
def create_data_loader(args):
    # Create dataset.
    train_ds, test_ds = load_dataset(
        datafiles=(os.path.join(args.data_dir, 'train.tsv'),
                   os.path.join(args.data_dir, 'test.tsv')))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to convert full-width characters to half-width characters.
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(convert_example,
                         max_seq_len=args.max_seq_len,
                         word_vocab=word_vocab,
                         label_vocab=label_vocab,
                         normlize_vocab=normlize_vocab)
    train_ds.map(trans_func)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=word_vocab.get("[PAD]", 0),
            dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=label_vocab.get("O", 0),
            dtype='int64'),  # label_ids
    ): fn(samples)

    # Create samplers for the dataloaders.
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)
    test_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)
    return word_vocab, label_vocab, train_loader, test_loader
def predict(self, data, tokenizer, batch_size=1, threshold=0.5):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(Example)`): The processed data whose each element is
            an Example (namedtuple) object. An Example object contains
            `text`(word_ids) and `seq_len`(sequence length).
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which
            contains most of the methods. Users should refer to the
            superclass for more information regarding methods.
        batch_size(obj:`int`, defaults to 1): The number of examples in one
            batch.
        threshold(obj:`float`, defaults to 0.5): The threshold for converting
            probabilities to labels.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    examples = []
    for text in data:
        example = {"text": text}
        input_ids, segment_ids = convert_example(
            example,
            tokenizer,
            max_seq_length=self.max_seq_length,
            is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    # Separate data into batches.
    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    results = []
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = paddle.to_tensor(self.output_handle.copy_to_cpu())
        probs = F.sigmoid(logits)
        preds = (probs.numpy() > threshold).astype(int)
        results.extend(preds)
    return results
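# --- Illustration (not part of the original script) ---
# How the sigmoid + threshold step above turns logits into multi-hot label
# vectors for multi-label classification; the logit values are made up.
import numpy as np

logits = np.array([[2.0, -1.0, 0.3],
                   [-0.5, 1.5, -2.0]])   # batch of 2 texts, 3 candidate labels
probs = 1.0 / (1.0 + np.exp(-logits))    # element-wise sigmoid
preds = (probs > 0.5).astype(int)        # apply the 0.5 threshold per label
print(preds)                             # [[1 0 1]
                                         #  [0 1 0]]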
def predict_cls(args, ext_results):
    # Load dict.
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)
    tokenizer = SkepTokenizer.from_pretrained(model_name)

    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64")
    ): fn(samples)

    # shuffle is False for prediction
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # Load cls model.
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()

    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)
        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results
def predict(self, data, tokenizer):
    """
    Computes the embeddings of the input texts and saves them to
    'corpus_embedding.npy'.

    Args:
        data (obj:`List(str)`): The batch data whose each element is a raw
            text.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which
            contains most of the methods. Users should refer to the
            superclass for more information regarding methods.
    """
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    all_embeddings = []
    examples = []
    for idx, text in enumerate(tqdm(data)):
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            max_seq_length=self.max_seq_length,
            pad_to_max_seq_len=True)
        examples.append((input_ids, segment_ids))
        # Run inference once 100 examples have accumulated.
        if len(examples) >= 100:
            input_ids, segment_ids = batchify_fn(examples)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            all_embeddings.append(logits)
            examples = []

    # Flush the remaining examples.
    if len(examples) > 0:
        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        all_embeddings.append(logits)

    all_embeddings = np.concatenate(all_embeddings, axis=0)
    np.save('corpus_embedding', all_embeddings)
def create_pretrained_dataset(args, input_path, worker_init, worker_index,
                              eod_id):
    train_data = GPT2Dataset(file_path=input_path,
                             worker_index=worker_index,
                             num_samples=args.batch_size * args.max_steps,
                             eod_id=eod_id,
                             seed=args.seed + worker_index)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_data,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_sampler=train_batch_sampler,
                                   num_workers=0,
                                   worker_init_fn=worker_init,
                                   collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                    Stack(), Stack()))
    return train_data_loader
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        task_name,
                                        splits=["train", "dev"])
    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']
    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq len
        Stack(dtype="int64")  # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
def build_data_loader(args, tokenizer):
    """Build corpus_data_loader and text_data_loader."""
    id2corpus = gen_id2corpus(args.corpus_file)

    # convert_example function's input must be a dict.
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
    ): [data for data in fn(samples)]

    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # Build text data_loader.
    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
    text_ds = MapDataset(text_list)
    text_data_loader = create_dataloader(text_ds,
                                         mode='predict',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    d = {
        "text_data_loader": text_data_loader,
        "corpus_data_loader": corpus_data_loader,
        "id2corpus": id2corpus,
        "text2similar_text": text2similar_text,
        "text_list": text_list
    }
    return d
def init_lstm_var(args):
    # Different languages use different tokenizers.
    if args.language == "ch":
        tokenizer = ErnieTokenizer.from_pretrained(args.vocab_path)
        padding_idx = tokenizer.vocab.get('[PAD]')
        tokenizer.inverse_vocab = [
            item[0]
            for item in sorted(tokenizer.vocab.items(), key=lambda x: x[1])
        ]
    else:
        vocab = Vocab.load_vocabulary(args.vocab_path,
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
        tokenizer = CharTokenizer(vocab)
        padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init the attention layer.
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Read data and generate mini-batches.
    dev_ds = SentiData().read(os.path.join(args.data_dir, 'dev'),
                              args.language)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    ): [data for data in fn(samples)]
    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
def do_predict(data,
               model,
               tokenizer,
               viterbi_decoder,
               tags_to_idx,
               idx_to_tags,
               batch_size=1,
               summary_num=2):
    examples = []
    for text in data:
        example = {"tokens": list(text)}
        input_ids, token_type_ids, seq_len = convert_example(example,
                                                             tokenizer,
                                                             args.max_seq_len,
                                                             is_test=True)
        examples.append((input_ids, token_type_ids, seq_len))

    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id,
            dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
            dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    ): fn(samples)

    all_pred_tags = []

    model.eval()
    for batch in batches:
        input_ids, token_type_ids, seq_len = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        seq_len = paddle.to_tensor(seq_len)
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())
    results = decode(data, all_pred_tags, summary_num, idx_to_tags)
    return results
def predict(model, data, label_map, batch_size=1, pad_token_id=0):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`List(Example)`): The processed data whose each element is
            an Example (namedtuple) object. An Example object contains
            `text`(word_ids) and `seq_len`(sequence length).
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The number of examples in one
            batch.
        pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

    Returns:
        results(obj:`list`): All the prediction labels.
    """
    # Separate data into batches.
    batches = [
        data[idx:idx + batch_size] for idx in range(0, len(data), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_token_id),  # query_ids
        Pad(axis=0, pad_val=pad_token_id),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    results = []
    model.eval()
    for batch in batches:
        query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)
        logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
def create_eval_dataset(args):
    val_dataloader = None
    eval_batch_size = args.batch_size
    seq_len = args.seq_length

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    pad_token = tokenizer.command_name_map["pad"].Id

    if not args.cloze_eval:
        with open(args.eval_path, "rb") as reader:
            entire_data = reader.read().decode('utf-8')
        num_original_tokens = len(entire_data.strip().split(" "))
        entire_data = wikitext_detokenizer(entire_data)
        tokenized_data = tokenizer.encode(entire_data)
        num_tokenized_tokens = len(tokenized_data)
        print('Original Tokens: %d, Detokenized tokens: %d' %
              (num_original_tokens, num_tokenized_tokens))
        val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, pad_token,
                                      args.overlapping_eval)
    else:
        tokenized_data = []
        tokenized_label = []
        with open(args.eval_path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = get_tokens(tokenizer, text)
                tokenized_data.append(tokens)
                tokenized_label.append(labels)
        val_dataset = Lambada_Eval_Dataset(tokenized_data, tokenized_label,
                                           seq_len, pad_token)
        num_tokenized_tokens = 0
        num_original_tokens = 0

    args.num_examples = len(val_dataset)
    args.num_original_tokens = num_original_tokens
    args.num_tokenized_tokens = num_tokenized_tokens
    val_dataloader = DataLoader(val_dataset,
                                batch_size=eval_batch_size,
                                drop_last=False,
                                collate_fn=Tuple(Stack(), Stack(), Stack(),
                                                 Stack(), Stack()))

    return val_dataloader
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
    print(dev_ds[0])

    pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
    tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')

    trans_func_eval = partial(convert_example,
                              tokenizer=tokenizer,
                              max_seq_length=args.max_seq_length,
                              phase="eval")

    batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id,
            dtype="int64"),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
            dtype="int64"),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn_eval,
                                        trans_fn=trans_func_eval)

    model = PairwiseMatching(pretrained_model, margin=args.margin)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    metric = paddle.metric.Auc()
    evaluate(model, metric, dev_data_loader, "dev")
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    dataset_class = TASK_CLASSES[task_name]
    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])

    if task_name == 'senta':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']
    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq len
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    train_ds = train_ds.apply(trans_fn, lazy=True)
    dev_ds = dev_ds.apply(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
def preprocess(self, input_dicts, data_id, log_id):
    from paddlenlp.data import Stack, Tuple, Pad

    (_, input_dict), = input_dicts.items()
    print("input dict", input_dict)
    batch_size = len(input_dict.keys())
    examples = []
    for i in range(batch_size):
        input_ids, segment_ids = convert_example([input_dict[str(i)]],
                                                 self.tokenizer)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id),  # segment
    ): fn(samples)

    input_ids, segment_ids = batchify_fn(examples)
    feed_dict = {}
    feed_dict['input_ids'] = input_ids
    feed_dict['token_type_ids'] = segment_ids
    return feed_dict, False, None, ""
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # Create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader.
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator.
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       test_dataset.vocab_size, test_dataset.num_labels)
    inputs = InputSpec(shape=(-1, ), dtype="int16", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int16", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    chunk_evaluator = ChunkEvaluator(
        label_list=test_dataset.label_vocab.keys(), suffix=True)
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting.
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
def get_mnli_dev_dataloader(tokenizer, args, matched=True):
    if matched:
        split = "dev_matched"
    else:
        split = "dev_mismatched"
    filename = os.path.join("caches", args.task_name + f"_{split}" + ".pkl")
    if os.path.exists(filename):
        ds = load_pickle(filename)
    else:
        ds = load_dataset("glue", args.task_name, splits=split)
        ds.map(
            partial(trans_func, tokenizer=tokenizer, args=args),
            batched=False,
            lazy=False,
        )
        save_pickle(ds, filename)

    batch_sampler = BatchSampler(ds,
                                 batch_size=args.train_batch_size,
                                 shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id,
            dtype="int64"),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id,
            dtype="int64"),  # attention_mask
        Pad(axis=0, pad_val=-100, dtype="int64"),  # lm_labels
        Pad(axis=0, pad_val=tokenizer.pad_token_id,
            dtype="int64"),  # decoder_attention_mask
    ): fn(samples)

    data_loader = DataLoader(
        dataset=ds,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        num_workers=args.num_workers,
        return_list=True,
    )
    return data_loader
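# --- Helper sketch (not part of the original script) ---
# `load_pickle`/`save_pickle` are referenced in `get_mnli_dev_dataloader` but
# not defined in this section; a minimal implementation they could plausibly
# correspond to (names and behavior assumed):
import pickle


def save_pickle(obj, path):
    # Cache the preprocessed dataset to disk.
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def load_pickle(path):
    # Restore a previously cached dataset.
    with open(path, "rb") as f:
        return pickle.load(f)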
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only support QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    dataset_class = TASK_CLASSES[task_name]
    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None,
    )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.apply(trans_func, lazy=True)
    dev_ds = dev_ds.apply(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Stack(dtype="int64" if train_ds.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples))]

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # Create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader.
    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=True)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       places=place,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and metric evaluator.
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       test_dataset.vocab_size, test_dataset.num_labels)
    model = paddle.Model(network)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((test_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for SOS and EOS
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start predicting.
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2,
    )
def preprocess_fn(data):
    examples = []
    if not isinstance(data, list):
        data = [data]
    for text in data:
        input_ids, segment_ids = convert_example(text,
                                                 tokenizer,
                                                 max_seq_length=128,
                                                 is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
    ): fn(samples)

    input_ids, segment_ids = batchify_fn(examples)
    # stop_gradient=False keeps the input tensors differentiable, e.g. for
    # gradient-based interpretation methods.
    return paddle.to_tensor(input_ids, stop_gradient=False), paddle.to_tensor(
        segment_ids, stop_gradient=False)
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only support QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_ds, dev_ds = load_dataset('glue',
                                    task_name,
                                    splits=["train", "dev"])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None,
    )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.map(trans_func, lazy=True)
    dev_ds = dev_ds.map(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),  # length
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
def predict(self, data, tokenizer, label_map):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(str)`): The batch data whose each element is a raw
            text.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which
            contains most of the methods. Users should refer to the
            superclass for more information regarding methods.
        label_map(obj:`dict`): The label id (key) to label str (value) map.

    Returns:
        labels(obj:`list`): All the prediction labels.
    """
    examples = []
    for text in data:
        example = {"text": text}
        input_ids, segment_ids = convert_example(
            example,
            tokenizer,
            max_seq_length=self.max_seq_length,
            is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    ): fn(samples)

    input_ids, segment_ids = batchify_fn(examples)
    self.input_handles[0].copy_from_cpu(input_ids)
    self.input_handles[1].copy_from_cpu(segment_ids)
    self.predictor.run()
    logits = self.output_handle.copy_to_cpu()

    probs = softmax(logits, axis=1)
    idx = np.argmax(probs, axis=1)
    idx = idx.tolist()
    labels = [label_map[i] for i in idx]
    return labels
def create_pretraining_dataset(input_file, shared_list, args, worker_init,
                               tokenizer):
    train_data = PretrainingDataset(input_file=input_file,
                                    tokenizer=tokenizer,
                                    max_seq_length=args.max_seq_length)
    # Files have been sharded, no need to dispatch again.
    train_batch_sampler = paddle.io.BatchSampler(train_data,
                                                 batch_size=args.batch_size,
                                                 shuffle=True)

    # DataLoader cannot be pickled because of its place.
    # If it can be pickled, use global function instead of lambda and use
    # ProcessPoolExecutor instead of ThreadPoolExecutor to prefetch.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_data,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   worker_init_fn=worker_init,
                                   return_list=True)
    return train_data_loader, input_file
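# --- Illustration (not part of the original script) ---
# The comment in `create_pretraining_dataset` notes that a module-level
# function (rather than a lambda) would make the collate function picklable.
# A sketch of that alternative; the function name and the pad_val parameter
# are assumptions for illustration:
from functools import partial

from paddlenlp.data import Pad, Tuple


def batchify_pretraining(samples, pad_val=0):
    # Same behavior as the lambda above, but defined at module level so that
    # (with functools.partial to bind pad_val) it can be pickled and used
    # with process-based prefetching.
    fn = Tuple(Pad(axis=0, pad_val=pad_val))
    return fn(samples)


# e.g. collate_fn = partial(batchify_pretraining, pad_val=tokenizer.pad_token_id)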
def main(args):
    paddle.set_device('gpu' if args.n_gpu else 'cpu')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1 and args.do_train:
        dist.init_parallel_env()

    set_seed(args.seed)

    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(dataset_class.convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_len)
    test_trans_func = partial(dataset_class.convert_example,
                              tokenizer=tokenizer,
                              max_seq_length=args.test_max_seq_len)
    metric = metric_class()

    if args.task_name in ('udc', 'dstc2', 'atis_intent', 'mrda', 'swda'):
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Stack(dtype='int64')  # label
        ): fn(samples)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, num_classes=dataset_class.num_classes())
    elif args.task_name == 'atis_slot':
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Pad(axis=0, pad_val=0, dtype='int64')  # label
        ): fn(samples)
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path,
            num_classes=dataset_class.num_classes(),
            dropout=0.0)
    if world_size > 1 and args.do_train:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_data_loader = create_data_loader(args, dataset_class,
                                               trans_func, batchify_fn,
                                               'train')
        if args.do_eval:
            dev_data_loader = create_data_loader(args, dataset_class,
                                                 test_trans_func, batchify_fn,
                                                 'dev')
        else:
            dev_data_loader = None
        train(args, model, train_data_loader, dev_data_loader, metric, rank)

    if args.do_test:
        if rank == 0:
            test_data_loader = create_data_loader(args, dataset_class,
                                                  test_trans_func,
                                                  batchify_fn, 'test')
            if args.do_train:
                # If do_eval=True, use best model to evaluate the test data.
                # Otherwise, use final model to evaluate the test data.
                if args.do_eval:
                    args.init_from_ckpt = os.path.join(
                        args.output_dir, 'best')
                    load_ckpt(args, model)
            else:
                if not args.init_from_ckpt:
                    raise ValueError('"init_from_ckpt" should be set.')
                load_ckpt(args, model)
            print('\nTest begin...')
            evaluation(args, model, test_data_loader, metric)
    pinyin_ids = encoded_inputs["pinyin_ids"]
    label = np.array([example["label"]], dtype="int64")
    return input_ids, pinyin_ids, label


# Process the data into a data format that the model can read in.
trans_func = partial(convert_example,
                     tokenizer=tokenizer,
                     max_seq_length=args.max_seq_length)

# Form data into batch data, such as padding text sequences of different
# lengths into the maximum length of batch data, and stacking each data
# label together.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    # Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Pad(axis=0, pad_val=0),  # pinyin_ids
    Stack()  # labels
): [data for data in fn(samples)]

from utils import create_dataloader

train_data_loader = create_dataloader(train_ds,
                                      mode='train',
                                      batch_size=args.batch_size,
                                      batchify_fn=batchify_fn,
                                      trans_fn=trans_func)
dev_data_loader = create_dataloader(dev_ds,
                                    mode='dev',
                                    batch_size=args.batch_size,
                                    batchify_fn=batchify_fn,
                                    trans_fn=trans_func)
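# --- Illustration (not part of the original script) ---
# A standalone demonstration of the `batchify_fn` idiom used throughout this
# section: `Tuple` applies one collate function per field across samples,
# `Pad` pads variable-length sequences to the batch maximum, and `Stack`
# stacks fixed-size fields. The sample values below are made up.
import numpy as np
from paddlenlp.data import Pad, Stack, Tuple

samples = [
    ([1, 2, 3, 4], [0]),  # (input_ids, label)
    ([5, 6], [1]),
    ([7, 8, 9], [0]),
]
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0),  # input_ids, padded to the batch max length
    Stack(dtype="int64"),    # labels, stacked into one array
): fn(samples)

input_ids, labels = batchify_fn(samples)
print(input_ids)  # shape (3, 4); shorter rows padded with 0
print(labels)     # shape (3, 1)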
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    # Load train dataset.
    file_name = 'train.csv'
    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_path, file_name),
                            is_test=False,
                            lazy=False)

    pretrained_model = ppnlp.transformers.BertModel.from_pretrained(
        "bert-base-uncased")
    tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained(
        'bert-base-uncased')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype='float32')  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    model = MultiLabelClassifier(pretrained_model,
                                 num_labels=len(train_ds.data[0]["label"]))

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    metric = MultiLabelReport()
    criterion = paddle.nn.BCEWithLogitsLoss()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)
            probs = F.sigmoid(logits)
            metric.update(probs, labels)
            auc, f1_score = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, auc: %.5f, f1 score: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, auc, f1_score,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % 100 == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir,
                                               "model_state.pdparams")
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)
def train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")

    # Create dataset.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader.
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_dataset,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and its loss.
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_dataset.vocab_size, train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator.
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_dataset.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)

    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training.
    callbacks = paddle.callbacks.ProgBarLogger(
        log_freq=10, verbose=3) if args.verbose else None
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              shuffle=True,
              callbacks=callbacks)
def do_train(args):
    paddle.set_device(args.select_device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset = dataset_class.get_datasets(["train"])
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64"
              if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]

    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    if args.task_name == "mnli":
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                              lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    num_classes = 1 if train_dataset.get_labels() is None else len(
        train_dataset.get_labels())
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (int(
        math.floor(num_training_steps * args.warmup_proportion)))

    # Linear warmup followed by linear decay.
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps: float(current_step) / float(
            max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss,
                       optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
                if (not args.n_cards > 1
                    ) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)