def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)
    eval_ds = eval_ds.map(trans_func)

    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def compare_lm(path="junnyu/microsoft-DialoGPT-small"):
    pdmodel = PDGPT2LMHeadModel.from_pretrained(path)
    ptmodel = PTGPT2LMHeadModel.from_pretrained(path).cuda()
    if "chinese" in path:
        text = "欢迎使用paddlenlp!"
        tokenizer = BertTokenizer.from_pretrained(path)
    else:
        text = "Welcome to paddlenlp!"
        tokenizer = GPTTokenizer.from_pretrained(path)
    pdmodel.eval()
    ptmodel.eval()
    pdinputs = {
        k: paddle.to_tensor(v, dtype="int64").unsqueeze(0)
        for k, v in tokenizer(text, return_token_type_ids=False).items()
    }
    ptinputs = {
        k: torch.tensor(v, dtype=torch.long).unsqueeze(0).cuda()
        for k, v in tokenizer(text, return_token_type_ids=False).items()
    }

    pd_logits = pdmodel(**pdinputs)
    pt_logits = ptmodel(**ptinputs).logits
    compare(pd_logits, pt_logits)
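# The `compare` helper called above is not shown in this snippet. A minimal
# sketch of what such a check might look like, assuming it only reports the
# maximum element-wise difference between the Paddle and PyTorch logits; the
# function body and the tolerance below are assumptions, not the original code.
import numpy as np

def compare(pd_tensor, pt_tensor, atol=1e-4):
    # Move both tensors to NumPy on CPU and compare element-wise.
    pd_array = pd_tensor.numpy()
    pt_array = pt_tensor.detach().cpu().numpy()
    max_diff = np.abs(pd_array - pt_array).max()
    print("max abs diff:", max_diff)
    assert np.allclose(pd_array, pt_array, atol=atol)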
def get_bert(params):
    model_bert = BertModel.from_pretrained("bert-base-uncased")
    bert_config = BertPretrainedModel.pretrained_init_configuration[
        "bert-base-uncased"]
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return model_bert, tokenizer, bert_config
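# Illustrative usage sketch for get_bert. Note that the `params` argument is
# unused in the body above, so None is passed here; the lookup of
# "hidden_size" assumes the usual keys in PaddleNLP's
# pretrained_init_configuration dict.
model_bert, tokenizer, bert_config = get_bert(None)
encoded = tokenizer("What is machine learning?")
print(encoded["input_ids"])        # token ids including [CLS]/[SEP]
print(bert_config["hidden_size"])  # 768 for bert-base-uncased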
def reader():
    # Create the tokenizer and dataset
    tokenizer = BertTokenizer.from_pretrained(args.model_dir)
    train_ds = load_dataset('glue', args.task, splits="train")

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=128,
                         is_test=True)

    train_ds = train_ds.map(trans_func, lazy=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type
    ): fn(samples)
    train_batch_sampler = paddle.io.BatchSampler(train_ds,
                                                 batch_size=32,
                                                 shuffle=True)

    [input_ids, token_type_ids, labels] = create_data_holder(args.task)
    train_data_loader = DataLoader(dataset=train_ds,
                                   feed_list=[input_ids, token_type_ids],
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=False)

    dev_trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             label_list=train_ds.label_list,
                             max_seq_length=128)
    dev_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    dev_ds = load_dataset('glue', args.task, splits='dev')
    dev_ds = dev_ds.map(dev_trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=32,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=dev_batchify_fn,
                                 num_workers=0,
                                 feed_list=[input_ids, token_type_ids, labels],
                                 return_list=False)

    return train_data_loader, dev_data_loader
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        task_name,
                                        splits=["train", "dev"])
    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None, )
        pad_val = vocab['[PAD]']
    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),          # seq len
        Stack(dtype="int64")           # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
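# Several loaders in this file delegate to a create_dataloader helper that is
# not shown. A minimal sketch of what it might do, inferred from the call
# sites only (the body below is an assumption, not the original helper): wrap
# both datasets in paddle.io.DataLoader with the shared collate function,
# shuffling only the train set.
from paddle.io import BatchSampler

def create_dataloader(train_ds, dev_ds, batch_size, batchify_fn, shuffle=True):
    train_loader = DataLoader(dataset=train_ds,
                              batch_sampler=BatchSampler(train_ds,
                                                         batch_size=batch_size,
                                                         shuffle=shuffle),
                              collate_fn=batchify_fn,
                              return_list=True)
    dev_loader = DataLoader(dataset=dev_ds,
                            batch_sampler=BatchSampler(dev_ds,
                                                       batch_size=batch_size,
                                                       shuffle=False),
                            collate_fn=batchify_fn,
                            return_list=True)
    return train_loader, dev_loader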
def load_squad_dataset(args):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    features_fn = prepare_train_features if args.is_training else prepare_validation_features
    if args.is_training:
        raw_dataset = load_dataset('squad', split='train')
    else:
        raw_dataset = load_dataset('squad', split='validation')
    column_names = raw_dataset.column_names
    dataset = raw_dataset.map(partial(
        features_fn, tokenizer=tokenizer, args=args),
                              batched=True,
                              remove_columns=column_names,
                              num_proc=4)

    bs = args.micro_batch_size * args.grad_acc_factor * args.batches_per_step * args.num_replica
    args.batch_size = bs
    if args.is_training:
        train_batch_sampler = BatchSampler(dataset,
                                           batch_size=bs,
                                           shuffle=args.shuffle,
                                           drop_last=True)
    else:
        train_batch_sampler = BatchSampler(dataset,
                                           batch_size=bs,
                                           shuffle=args.shuffle,
                                           drop_last=False)

    if args.is_training:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack(),
            "start_positions": Stack(),
            "end_positions": Stack()
        }): fn(samples)
    else:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack()
        }): fn(samples)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=train_batch_sampler,
                             collate_fn=collate_fn,
                             return_list=True)
    return raw_dataset, data_loader
def __init__(self, model_config):
    """
    :param model_config: the model configuration object
    """
    super(ErnieInputEncoderV2, self).__init__()
    self.config = model_config
    self.enc_value_with_col = model_config.enc_value_with_col
    if model_config.pretrain_model_type == 'BERT':
        self.tokenizer = BertTokenizer.from_pretrained(
            model_config.pretrain_model)
        self.special_token_dict = {
            # span type
            'table': '[unused1]',
            'column': '[unused2]',
            'value': '[unused3]',
            # column data type
            'text': '[unused11]',
            'real': '[unused12]',
            'number': '[unused13]',
            'time': '[unused14]',
            'binary': '[unused15]',
            'boolean': '[unused16]',
            'bool': '[unused17]',
            'others': '[unused18]',
        }
    else:
        self.tokenizer = ErnieTokenizer.from_pretrained(
            model_config.pretrain_model)
        # Low-frequency tokens repurposed as special markers.
        # Another candidate: overchicstoretvhome
        self.special_token_dict = {
            # span type
            'table': 'blogabstract',
            'column': 'wx17house',
            'value': 'fluke62max',
            # column data type
            'text': 'googlemsn',
            'real': 'sputniknews',
            'number': 'sputniknews',
            'time': 'pixstyleme3c',
            'binary': 'pixnetfacebookyahoo',
            'boolean': 'pixnetfacebookyahoo',
            'bool': 'pixnetfacebookyahoo',
            'others': 'ubuntuforumwikilinuxpastechat',
        }

    self._need_bool_value = self.config.grammar_type != 'nl2sql'
def bert(model_name_or_path='bert-base-uncased',
         model_select='sequence_classification'):
    """
    Returns a BERT model and tokenizer from a given pretrained model name or
    path and the class type of the task, such as sequence classification.

    Args:
        model_name_or_path (str, optional): A name of or a file path to a
            pretrained model. It could be 'bert-base-uncased',
            'bert-large-uncased', 'bert-base-multilingual-uncased',
            'bert-base-cased', 'bert-base-chinese', 'bert-large-cased',
            'bert-base-multilingual-cased', 'bert-wwm-chinese' or
            'bert-wwm-ext-chinese'.
            Default: 'bert-base-uncased'.
        model_select (str, optional): The model class to select. It could be
            'bert', 'sequence_classification', 'token_classification',
            'question_answering' or 'pretraining'. If 'sequence_classification'
            is chosen, the model class would be `BertForSequenceClassification`.
            The documentation of the BERT model can be found at
            `bert.modeling <https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.modeling.html>`_
            Default: 'sequence_classification'.

    Returns:
        tuple: Returns the pretrained BERT model and BERT tokenizer.

    Example:
        .. code-block:: python

            import paddle.hub as hub

            model, tokenizer = hub.load(
                'PaddlePaddle/PaddleNLP:develop',
                model='bert',
                model_name_or_path='bert-base-cased')

    """
    assert model_name_or_path in _BERT_PRETRAINED_MODELS or os.path.isdir(model_name_or_path), \
        "Please check your model name or path. Supported model names are: {}.".format(
            tuple(_BERT_PRETRAINED_MODELS))
    assert model_select in _BERT_MODEL_CLASSES.keys(), \
        "Please check `model_select`, it should be in {}.".format(
            tuple(_BERT_MODEL_CLASSES.keys()))
    model_class = _BERT_MODEL_CLASSES[model_select]
    model = model_class.from_pretrained(model_name_or_path)
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    return model, tokenizer
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    dataset_class = TASK_CLASSES[task_name]
    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])
    if task_name == 'senta':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None, )
        pad_val = vocab['[PAD]']
    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),          # seq len
        Stack(dtype="int64")           # label
    ): [data for data in fn(samples)]

    train_ds = train_ds.apply(trans_fn, lazy=True)
    dev_ds = dev_ds.apply(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only support QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    dataset_class = TASK_CLASSES[task_name]
    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None, )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.apply(trans_func, lazy=True)
    dev_ds = dev_ds.apply(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),                              # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),                              # length
        Stack(dtype="int64" if train_ds.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples))]

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
def create_pair_loader_for_small_model(task_name,
                                       model_name,
                                       vocab_path,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True,
                                       is_test=False):
    """Only support QQP now."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_ds, dev_ds = load_dataset('glue', task_name, splits=["train", "dev"])
    vocab = Vocab.load_vocabulary(
        vocab_path,
        unk_token='[UNK]',
        pad_token='[PAD]',
        bos_token=None,
        eos_token=None, )

    trans_func = partial(convert_pair_example,
                         task_name=task_name,
                         vocab=tokenizer,
                         is_tokenized=False,
                         max_seq_length=max_seq_length,
                         is_test=is_test)
    train_ds = train_ds.map(trans_func, lazy=True)
    dev_ds = dev_ds.map(trans_func, lazy=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),                              # length
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input
        Stack(),                              # length
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)
    return train_data_loader, dev_data_loader
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        default=None,
        type=str,
        required=True,
        help="The input train corpus; can be a directory with .txt files or a path to a single file."
    )
    parser.add_argument(
        "--output_file",
        default=None,
        type=str,
        required=True,
        help="The output file where the created hdf5-formatted data will be written."
    )
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=False,
        help="The vocabulary the BERT model will train on. "
        "Ignored when the bert_model argument is used. "
        "The bert_model argument is recommended.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        default=True,
        help="Whether to lower case the input text. True for uncased models, False for cased models. "
        "Ignored when the bert_model argument is used. The bert_model argument is recommended."
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese. "
        "If provided, the pre-trained model's tokenizer is used to create the data, "
        "and vocab_file and do_lower_case are ignored.")

    # Other parameters
    # ints
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. "
        "Sequences longer than this will be truncated, and sequences shorter "
        "than this will be padded.")
    parser.add_argument(
        "--dupe_factor",
        default=10,
        type=int,
        help="Number of times to duplicate the input data (with different masks)."
    )
    parser.add_argument(
        "--max_predictions_per_seq",
        default=20,
        type=int,
        help="Maximum number of masked LM predictions per sequence.")

    # floats
    parser.add_argument(
        "--masked_lm_prob",
        default=0.15,
        type=float,
        help="Masked LM probability.")
    parser.add_argument(
        "--short_seq_prob",
        default=0.1,
        type=float,
        help="Probability to create a sequence shorter than the maximum sequence length."
    )
    parser.add_argument(
        '--random_seed',
        type=int,
        default=12345,
        help="Random seed for initialization.")

    args = parser.parse_args()
    print(args)

    if args.bert_model:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    else:
        assert args.vocab_file, (
            "vocab_file must be set if bert_model is not provided.")
        tokenizer = BertTokenizer(
            args.vocab_file, do_lower_case=args.do_lower_case)

    input_files = []
    if os.path.isfile(args.input_file):
        input_files.append(args.input_file)
    elif os.path.isdir(args.input_file):
        input_files = [
            os.path.join(args.input_file, f)
            for f in os.listdir(args.input_file)
            if (os.path.isfile(os.path.join(args.input_file, f)) and
                f.endswith('.txt'))
        ]
    else:
        raise ValueError("{} is not a valid path".format(args.input_file))

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob,
        args.max_predictions_per_seq, rng)

    output_file = args.output_file
    write_instance_to_example_file(instances, tokenizer, args.max_seq_length,
                                   args.max_predictions_per_seq, output_file)
def __init__(self, model_name, param_path):
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.model = BertForSequenceClassification.from_pretrained(model_name)
    self.model.set_state_dict(paddle.load(param_path))
    self.model.eval()
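# The enclosing class is not shown above. Assuming this __init__ belongs to a
# small sequence-classification predictor, a hypothetical predict method to
# pair with it might look like the sketch below; the method name and the
# single-example batching are assumptions for illustration only.
@paddle.no_grad()
def predict(self, text):
    encoded = self.tokenizer(text)
    input_ids = paddle.to_tensor([encoded["input_ids"]])
    token_type_ids = paddle.to_tensor([encoded["token_type_ids"]])
    logits = self.model(input_ids, token_type_ids)
    # Return the predicted class id for the single input example.
    return int(paddle.argmax(logits, axis=-1).numpy()[0])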
def evaluate():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    data_loader = DataLoader(dataset=dev_dataset,
                             batch_sampler=dev_batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=0,
                             return_list=True)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    evaluated_sentences_ids = []
    reference_sentences_ids = []
    logger.info("Evaluating...")
    for data in tqdm(data_loader):
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use target when infer
        # Use greedy_search_infilling or beam_search_infilling to get predictions
        output_ids = beam_search_infilling(model,
                                           src_ids,
                                           src_sids,
                                           eos_id=eos_id,
                                           sos_id=sos_id,
                                           attn_id=attn_id,
                                           pad_id=pad_id,
                                           unk_id=unk_id,
                                           vocab_size=vocab_size,
                                           max_decode_len=args.max_decode_len,
                                           max_encode_len=args.max_encode_len,
                                           beam_width=args.beam_width,
                                           length_penalty=args.length_penalty,
                                           tgt_type_id=tgt_type_id)

        for ids in output_ids.tolist():
            if eos_id in ids:
                ids = ids[:ids.index(eos_id)]
            evaluated_sentences_ids.append(ids)

        for ids in raw_tgt_labels.numpy().tolist():
            ids = ids[:ids.index(eos_id)]
            reference_sentences_ids.append(ids)

    score1 = rouge1.score(evaluated_sentences_ids, reference_sentences_ids)
    score2 = rouge2.score(evaluated_sentences_ids, reference_sentences_ids)

    logger.info("Rouge-1: %.5f, Rouge-2: %.5f" % (score1 * 100, score2 * 100))
from functools import partial

from paddle.io import DataLoader
from paddlenlp.data import Stack, Tuple, Pad, Dict
from paddlenlp.datasets.experimental import SQuAD, load_dataset
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer

train_ds, dev_ds = load_dataset('squad', splits=('train_v2', 'dev_v2'))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(len(train_ds))
print(len(dev_ds))
print(train_ds[0])
print('-----------------------------------------------------------')


def prepare_train_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the
    # overflows using a stride. This results in one example possibly giving
    # several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    contexts = [examples[i]['context'] for i in range(5000)]
    questions = [examples[i]['question'] for i in range(5000)]

    tokenized_examples = tokenizer(
        questions, contexts, stride=128, max_seq_len=384)

    print(len(tokenized_examples))
    # Since one example might give us several features if it has a long
    # context, we need a map from a feature to its corresponding example.
    # This key gives us just that.
def do_predict(args):
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_dataset, predict_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "test"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=label_num - 1,
                         max_seq_length=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),                                                    # length
        Pad(axis=0, pad_val=ignore_label)                           # label
    ): fn(samples)
    raw_data = predict_dataset.data

    id2label = dict(enumerate(predict_dataset.get_labels()))

    predict_dataset = predict_dataset.apply(trans_func, lazy=True)
    predict_batch_sampler = paddle.io.BatchSampler(predict_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False,
                                                   drop_last=True)
    predict_data_loader = DataLoader(dataset=predict_dataset,
                                     batch_sampler=predict_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, segment_ids, length, labels = batch
        logits = model(input_ids, segment_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
def run(self):
    args = self.args
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Reads label_map (the ids of the B/I/O tags).
    label_map_path = os.path.join(args.data_path, "predicate2id.json")
    if not (os.path.exists(label_map_path) and
            os.path.isfile(label_map_path)):
        sys.exit("{} does not exist or is not a file.".format(label_map_path))
    with open(label_map_path, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)  # dict
    # Each predicate gets two B tags because object and subject are
    # distinguished, plus the I and O tags.
    num_classes = (len(label_map.keys()) - 2) * 2 + 2

    # Loads the pretrained BERT model.
    model = BertForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    model = paddle.DataParallel(model)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    criterion = BCELossForDuIE()

    # Loads dataset.
    train_dataset = DuIEDataset.from_file(
        os.path.join(args.data_path, 'train_data.json'), tokenizer,
        args.max_seq_length, True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    collator = DataCollator()
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=collator,
                                   return_list=True)

    eval_file_path = os.path.join(args.data_path, 'dev_data.json')
    test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer,
                                         args.max_seq_length, True)
    test_batch_sampler = paddle.io.BatchSampler(test_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False,
                                                drop_last=True)
    test_data_loader = DataLoader(dataset=test_dataset,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=collator,
                                  return_list=True)

    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_ratio)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 10000
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        print("\n=====start training of %d epochs=====" % epoch)
        tic_epoch = time.time()
        model.train()
        for step, batch in enumerate(train_data_loader):
            input_ids, seq_lens, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
            logits = model(input_ids=input_ids)
            mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and(
                (input_ids != 2))
            loss = criterion(logits, labels, mask)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            global_step += 1

            if global_step % logging_steps == 0 and rank == 0:
                print(
                    "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                    % (epoch, args.num_train_epochs, step, steps_by_epoch,
                       loss_item, logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % save_steps == 0 and rank == 0:
                print("\n=====start evaluating ckpt of %d steps=====" %
                      global_step)
                precision, recall, f1 = self.evaluate(
                    model, criterion, test_data_loader, eval_file_path,
                    "eval")
                print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                      (100 * precision, 100 * recall, 100 * f1))
                print("saving checkpoint model_%d.pdparams to %s " %
                      (global_step, args.output_dir))
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.output_dir,
                                 "model_%d.pdparams" % global_step))
                model.train()  # back to train mode

        tic_epoch = time.time() - tic_epoch
        print("epoch time footprint: %d hour %d min %d sec" %
              (tic_epoch // 3600, (tic_epoch % 3600) // 60, tic_epoch % 60))

    # Does final evaluation.
    if rank == 0:
        print("\n=====start evaluating last ckpt of %d steps=====" %
              global_step)
        precision, recall, f1 = self.evaluate(model, criterion,
                                              test_data_loader,
                                              eval_file_path, "eval")
        print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
              (100 * precision, 100 * recall, 100 * f1))
        paddle.save(
            model.state_dict(),
            os.path.join(args.output_dir,
                         "model_%d.pdparams" % global_step))
        print("\n=====training complete=====")
def train():
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len,
                                 noise_prob=args.noise_prob,
                                 use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    label_num = model.word_emb.weight.shape[0]
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    max_steps = len(train_data_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    rouge1 = Rouge1()
    rouge2 = Rouge2()
    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels, _) = batch
            # Encode the source sequence once and cache its keys/values.
            _, __, info = model(src_ids,
                                sent_ids=src_sids,
                                pos_ids=src_pids,
                                attn_bias=mask_src_2_src,
                                encode_only=True)
            cached_k, cached_v = info['caches']
            # Encode the target sequence on top of the cached source states.
            _, __, info = model(tgt_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_tgt_2_srctgt,
                                past_cache=(cached_k, cached_v),
                                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                paddle.concat([k, k2], 1)
                for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                paddle.concat([v, v2], 1)
                for v, v2 in zip(cached_v, cached_v2)
            ]
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            loss, _, __ = model(attn_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_attn_2_srctgtattn,
                                past_cache=(past_cache_k, past_cache_v),
                                tgt_labels=tgt_labels,
                                tgt_pos=paddle.nonzero(attn_ids == attn_id))

            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                    tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0 and (
                (not args.n_gpu > 1) or
                    paddle.distributed.get_rank() == 0):
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
def predict():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    data_loader = DataLoader(dataset=dev_dataset,
                             batch_sampler=test_batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=0,
                             return_list=True)

    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    logger.info("Predicting...")
    for data in data_loader:
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use target when infer
        # Use greedy_search_infilling or beam_search_infilling to get predictions
        output_ids = beam_search_infilling(model,
                                           src_ids,
                                           src_sids,
                                           eos_id=eos_id,
                                           sos_id=sos_id,
                                           attn_id=attn_id,
                                           pad_id=pad_id,
                                           unk_id=unk_id,
                                           vocab_size=vocab_size,
                                           max_decode_len=args.max_decode_len,
                                           max_encode_len=args.max_encode_len,
                                           beam_width=args.beam_width,
                                           length_penalty=args.length_penalty,
                                           tgt_type_id=tgt_type_id)

        for source_ids, target_ids, predict_ids in zip(
                src_ids.numpy().tolist(),
                raw_tgt_labels.numpy().tolist(), output_ids.tolist()):
            if eos_id in predict_ids:
                predict_ids = predict_ids[:predict_ids.index(eos_id)]
            source_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(source_ids[1:source_ids.index(eos_id)])))
            tgt_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(target_ids[1:target_ids.index(eos_id)])))
            predict_sentence = ''.join(
                map(post_process, vocab.to_tokens(predict_ids)))
            print("source :%s\ntarget :%s\npredict:%s\n" %
                  (source_sentence, tgt_sentence, predict_sentence))
def create_distill_loader(task_name,
                          model_name,
                          vocab_path,
                          batch_size=64,
                          max_seq_length=128,
                          shuffle=True,
                          n_iter=20,
                          whole_word_mask=False,
                          seed=0):
    """
    Returns batch data for bert and small model.
    Bert and small model have different input representations.
    """
    dataset_class = TASK_CLASSES[task_name]
    train_ds, dev_ds = dataset_class.get_datasets(['train', 'dev'])
    tokenizer = BertTokenizer.from_pretrained(model_name)
    if task_name == 'senta':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None, )
        pad_val = vocab['[PAD]']
    else:
        vocab = tokenizer
        pad_val = tokenizer.pad_token_id

    if task_name == 'senta':
        train_ds = apply_data_augmentation_for_cn(train_ds,
                                                  tokenizer,
                                                  vocab,
                                                  n_iter=n_iter,
                                                  seed=seed)
    else:
        train_ds = apply_data_augmentation(task_name,
                                           train_ds,
                                           tokenizer,
                                           n_iter=n_iter,
                                           whole_word_mask=whole_word_mask,
                                           seed=seed)
    print("Data augmentation has been applied.")

    trans_fn = partial(convert_two_example,
                       task_name=task_name,
                       tokenizer=tokenizer,
                       label_list=train_ds.get_labels(),
                       max_seq_length=max_seq_length,
                       vocab=vocab)

    trans_fn_dev = partial(convert_two_example,
                           task_name=task_name,
                           tokenizer=tokenizer,
                           label_list=train_ds.get_labels(),
                           max_seq_length=max_seq_length,
                           vocab=vocab,
                           is_tokenized=False)

    if task_name == 'qqp':
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert segment
            Pad(axis=0, pad_val=pad_val),                 # small input_ids
            Stack(dtype="int64"),                         # small seq len
            Pad(axis=0, pad_val=pad_val),                 # small input_ids
            Stack(dtype="int64"),                         # small seq len
            Stack(dtype="int64")                          # small label
        ): [data for data in fn(samples)]
    else:
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert segment
            Pad(axis=0, pad_val=pad_val),                 # small input_ids
            Stack(dtype="int64"),                         # small seq len
            Stack(dtype="int64")                          # small label
        ): [data for data in fn(samples)]

    train_ds = train_ds.apply(trans_fn, lazy=True)
    dev_ds = dev_ds.apply(trans_fn_dev, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader
def do_predict(args):
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_ds, predict_ds = load_dataset('msra_ner',
                                        splits=('train', 'test'),
                                        lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)
    raw_data = predict_ds.data

    id2label = dict(enumerate(predict_ds.label_list))

    predict_ds = predict_ds.map(trans_func)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
# Warm up the HuggingFace fast tokenizer before timing it.
for batch_data in batches:
    encoded_inputs = hf_tokenizer(batch_data)

# BERT Tokenizer using HuggingFace AutoTokenizer
start = time.time()
for _ in range(epochs):
    for batch_data in batches:
        encoded_inputs = hf_tokenizer(batch_data)  # , padding=True, truncation=True
end = time.time()
print("The throughput of huggingface FasterTokenizer: {:,.2f} tokens/s".format(
    total_tokens / (end - start)))

# BERT Tokenizer using PaddleNLP BertTokenizer
py_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
# Warm up.
for batch_data in batches:
    encoded_inputs = py_tokenizer(batch_data)

start = time.time()
for _ in range(epochs):
    for batch_data in batches:
        encoded_inputs = py_tokenizer(batch_data)
end = time.time()
print("The throughput of paddle BertTokenizer: {:,.2f} tokens/s".format(
    total_tokens / (end - start)))

# BERT Tokenizer using HuggingFace AutoTokenizer (slow, pure-Python variant)
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese", use_fast=False)
def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    eval_ds = eval_ds.select(range(len(eval_ds) - 1))
    eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def do_predict(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_examples, predict_examples = load_dataset('msra_ner',
                                                    split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_examples.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)

    id2label = dict(enumerate(label_list))

    predict_examples = predict_examples.select(
        range(len(predict_examples) - 1))
    predict_ds = predict_examples.map(tokenize_and_align_labels, batched=True)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(predict_examples, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
def train():
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = load_dataset(
        'poetry', splits=('train', 'dev'), lazy=False)
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.map(trans_func)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # src_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tgt_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    dev_dataset = dev_dataset.map(trans_func)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    label_num = model.word_emb.weight.shape[0]

    train_model = StackModel(model)
    if paddle.distributed.get_world_size() > 1:
        # All 'forward' outputs derived from the module parameters used in
        # DataParallel must participate in the calculation of losses and
        # subsequent gradient calculations. So we use StackModel here to make
        # the model only output loss in its 'forward' function.
        train_model = paddle.DataParallel(train_model)

    max_steps = len(train_data_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()
    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels, _) = batch
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids,
                               tgt_pids, attn_ids, mask_src_2_src,
                               mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
                               tgt_labels, tgt_pos)
            if global_step % args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                    tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and paddle.distributed.get_rank(
            ) == 0:
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, dev_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "dev"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=label_num - 1,
                         max_seq_length=args.max_seq_length)

    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),                                                    # length
        Pad(axis=0, pad_val=ignore_label)                           # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               drop_last=True)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Linear warmup followed by linear decay.
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps if args.max_steps > 0 else
        (len(train_data_loader) * args.num_train_epochs): float(
            current_step) / float(max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(int(math.ceil((label_num + 1) / 2.0)), "IOB")

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            input_ids, segment_ids, length, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0:
                evaluate(model, loss_fct, metric, dev_data_loader, label_num)
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            global_step += 1
def convert_example_to_feature(
        example,
        tokenizer: BertTokenizer,
        chineseandpunctuationextractor: ChineseAndPunctuationExtractor,
        label_map,
        max_length: Optional[int] = 512,
        pad_to_max_length: Optional[bool] = None):
    spo_list = example['spo_list'] if "spo_list" in example.keys() else None
    text_raw = example['text']

    sub_text = []
    buff = ""
    for char in text_raw:
        if chineseandpunctuationextractor.is_chinese_or_punct(char):
            if buff != "":
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
        else:
            buff += char
    if buff != "":
        sub_text.append(buff)

    tok_to_orig_start_index = []
    tok_to_orig_end_index = []
    orig_to_tok_index = []
    tokens = []
    text_tmp = ''
    for (i, token) in enumerate(sub_text):
        orig_to_tok_index.append(len(tokens))
        sub_tokens = tokenizer._tokenize(token)
        text_tmp += token
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(len(text_tmp) - len(token))
            tok_to_orig_end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_length - 2:
                break
        else:
            continue
        break

    seq_len = len(tokens)
    # 2 tags for each predicate + I tag + O tag
    num_labels = 2 * (len(label_map.keys()) - 2) + 2
    # initialize tag
    labels = [[0] * num_labels for i in range(seq_len)]
    if spo_list is not None:
        labels = parse_label(spo_list, label_map, tokens, tokenizer)

    # add [CLS] and [SEP] token, they are tagged into "O" for outside
    if seq_len > max_length - 2:
        tokens = tokens[0:(max_length - 2)]
        labels = labels[0:(max_length - 2)]
        tok_to_orig_start_index = tok_to_orig_start_index[0:(max_length - 2)]
        tok_to_orig_end_index = tok_to_orig_end_index[0:(max_length - 2)]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # "O" tag for [PAD], [CLS], [SEP] token
    outside_label = [[1] + [0] * (num_labels - 1)]

    labels = outside_label + labels + outside_label
    tok_to_orig_start_index = [-1] + tok_to_orig_start_index + [-1]
    tok_to_orig_end_index = [-1] + tok_to_orig_end_index + [-1]
    if seq_len < max_length:
        tokens = tokens + ["[PAD]"] * (max_length - seq_len - 2)
        labels = labels + outside_label * (max_length - len(labels))
        tok_to_orig_start_index = tok_to_orig_start_index + [-1] * (
            max_length - len(tok_to_orig_start_index))
        tok_to_orig_end_index = tok_to_orig_end_index + [-1] * (
            max_length - len(tok_to_orig_end_index))

    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    return InputFeature(
        input_ids=np.array(token_ids),
        seq_len=np.array(seq_len),
        tok_to_orig_start_index=np.array(tok_to_orig_start_index),
        tok_to_orig_end_index=np.array(tok_to_orig_end_index),
        labels=np.array(labels), )
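# The splitting loop at the top of convert_example_to_feature groups runs of
# consecutive non-Chinese, non-punctuation characters into a single buffered
# piece. A standalone sketch of that behavior on a toy string; the real
# ChineseAndPunctuationExtractor is replaced here by the simplifying
# assumption that any non-ASCII character counts as Chinese or punctuation.
def split_mixed_text(text):
    def is_chinese_or_punct(ch):
        # Toy stand-in for the real extractor.
        return ord(ch) > 127

    pieces, buff = [], ""
    for ch in text:
        if is_chinese_or_punct(ch):
            if buff:
                pieces.append(buff)
                buff = ""
            pieces.append(ch)
        else:
            buff += ch
    if buff:
        pieces.append(buff)
    return pieces

print(split_mixed_text("用BERT做NER"))  # ['用', 'BERT', '做', 'NER']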
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),  # seq_len
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    last_step = args.num_train_epochs * len(train_data_loader)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == last_step:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
def main(args):
    paddle.set_device('gpu' if args.n_gpu else 'cpu')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1 and args.do_train:
        dist.init_parallel_env()

    set_seed(args.seed)

    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(dataset_class.convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_len)
    test_trans_func = partial(dataset_class.convert_example,
                              tokenizer=tokenizer,
                              max_seq_length=args.test_max_seq_len)
    metric = metric_class()

    if args.task_name in ('udc', 'dstc2', 'atis_intent', 'mrda', 'swda'):
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Stack(dtype='int64')                          # label
        ): fn(samples)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, num_classes=dataset_class.num_classes())
    elif args.task_name == 'atis_slot':
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Pad(axis=0, pad_val=0, dtype='int64')         # label
        ): fn(samples)
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path,
            num_classes=dataset_class.num_classes(),
            dropout=0.0)
    if world_size > 1 and args.do_train:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_data_loader = create_data_loader(args, dataset_class,
                                               trans_func, batchify_fn,
                                               'train')
        if args.do_eval:
            dev_data_loader = create_data_loader(args, dataset_class,
                                                 test_trans_func, batchify_fn,
                                                 'dev')
        else:
            dev_data_loader = None
        train(args, model, train_data_loader, dev_data_loader, metric, rank)

    if args.do_test:
        if rank == 0:
            test_data_loader = create_data_loader(args, dataset_class,
                                                  test_trans_func,
                                                  batchify_fn, 'test')
            if args.do_train:
                # If do_eval=True, use the best model to evaluate the test
                # data. Otherwise, use the final model to evaluate the test
                # data.
                if args.do_eval:
                    args.init_from_ckpt = os.path.join(args.output_dir,
                                                       'best')
                    load_ckpt(args, model)
            else:
                if not args.init_from_ckpt:
                    raise ValueError('"init_from_ckpt" should be set.')
                load_ckpt(args, model)
            print('\nTest begin...')
            evaluation(args, model, test_data_loader, metric)
def _construct_tokenizer(self, model):
    """
    Construct the tokenizer for the predictor.
    """
    self._tokenizer = BertTokenizer.from_pretrained(model)