def do_eval(args):
    """Evaluate a BERT token-classification model on the 'msra_ner' test split.

    Builds the evaluation data pipeline, optionally restores weights from
    ``args.init_checkpoint_path``, runs the model over every batch and prints
    the loss together with the precision, recall and F1 reported by
    ``ChunkEvaluator``.

    Note: the printed loss is the mean loss of the *last* batch only (this is
    the original behaviour); the metrics are accumulated over all batches.

    Args:
        args: Namespace providing ``device``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size`` and ``init_checkpoint_path``.
    """
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    # The label set is taken from the training split.
    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    # Label positions padded with ``ignore_label`` are skipped by the loss.
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0,
                         pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64'),  # label
    }): fn(samples)

    eval_ds = eval_ds.map(trans_func)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    # Defined up front so the final report still works for an empty loader.
    avg_loss = float("nan")
    # Gradients are never needed for evaluation; skip building the graph.
    with paddle.no_grad():
        for batch in eval_data_loader:
            input_ids, token_type_ids, length, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            preds = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = \
                metric.compute(length, preds, labels)
            metric.update(num_infer_chunks.numpy(),
                          num_label_chunks.numpy(),
                          num_correct_chunks.numpy())

    # accumulate() reports over everything passed to update(), so calling it
    # once after the loop yields the same numbers as calling it per batch.
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def main(args):
    """Train, validate and/or test a BERT model for one dialogue task.

    The task named by ``args.task_name`` selects the dataset/metric classes
    (via ``TASK_CLASSES``), the batch collator and the model head. Training
    runs when ``args.do_train`` is set; testing runs on rank 0 when
    ``args.do_test`` is set.

    Args:
        args: Namespace providing ``n_gpu``, ``do_train``, ``do_eval``,
            ``do_test``, ``seed``, ``task_name``, ``model_name_or_path``,
            ``max_seq_len``, ``test_max_seq_len``, ``output_dir`` and
            ``init_from_ckpt``.

    Raises:
        ValueError: If testing without training and ``args.init_from_ckpt``
            is not set.
    """
    paddle.set_device('gpu' if args.n_gpu else 'cpu')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    # Multi-process setup is only needed when actually training.
    parallel_training = world_size > 1 and args.do_train
    if parallel_training:
        dist.init_parallel_env()

    set_seed(args.seed)

    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    convert = dataset_class.convert_example
    trans_func = partial(convert,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_len)
    test_trans_func = partial(convert,
                              tokenizer=tokenizer,
                              max_seq_length=args.test_max_seq_len)
    metric = metric_class()

    sequence_level_tasks = ('udc', 'dstc2', 'atis_intent', 'mrda', 'swda')
    if args.task_name in sequence_level_tasks:
        # One label per example: labels are stacked.
        collator = Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Stack(dtype='int64'),  # label
        )
        batchify_fn = lambda samples: collator(samples)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, num_classes=dataset_class.num_classes())
    elif args.task_name == 'atis_slot':
        # One label per token: labels are padded like the inputs.
        collator = Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Pad(axis=0, pad_val=0, dtype='int64'),  # label
        )
        batchify_fn = lambda samples: collator(samples)
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path,
            num_classes=dataset_class.num_classes(),
            dropout=0.0)

    if parallel_training:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_data_loader = create_data_loader(args, dataset_class,
                                               trans_func, batchify_fn,
                                               'train')
        dev_data_loader = None
        if args.do_eval:
            dev_data_loader = create_data_loader(args, dataset_class,
                                                 test_trans_func, batchify_fn,
                                                 'dev')
        train(args, model, train_data_loader, dev_data_loader, metric, rank)

    # Testing happens on rank 0 only.
    if not (args.do_test and rank == 0):
        return

    test_data_loader = create_data_loader(args, dataset_class,
                                          test_trans_func, batchify_fn, 'test')
    if args.do_train and args.do_eval:
        # Trained with validation: test the best checkpoint.
        args.init_from_ckpt = os.path.join(args.output_dir, 'best')
        load_ckpt(args, model)
    elif not args.do_train:
        # Test-only run: a checkpoint must be supplied explicitly.
        if not args.init_from_ckpt:
            raise ValueError('"init_from_ckpt" should be set.')
        load_ckpt(args, model)
    # Otherwise (trained without validation) the in-memory final model is used.

    print('\nTest begin...')
    evaluation(args, model, test_data_loader, metric)
def run(self):
    """Fine-tune BERT for token classification on the DuIE data and evaluate.

    Reads ``predicate2id.json`` from ``args.data_path`` to size the output
    layer, trains on ``train_data.json``, evaluates on ``dev_data.json`` every
    10000 steps (rank 0 only) and once more after the last epoch, saving a
    ``model_<step>.pdparams`` checkpoint to ``args.output_dir`` each time.

    Exits the process via ``sys.exit`` when the label map file is missing.
    """
    args = self.args
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Reads label_map (original note: read the ids of the B/I/O labels).
    predicate_map_file = os.path.join(args.data_path, "predicate2id.json")
    map_file_usable = (os.path.exists(predicate_map_file)
                       and os.path.isfile(predicate_map_file))
    if not map_file_usable:
        sys.exit("{} dose not exists or is not a file.".format(
            predicate_map_file))
    with open(predicate_map_file, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)  # dict
    # Original note: because object and subject are distinguished, the B
    # labels are doubled, plus 2.
    num_classes = (len(label_map.keys()) - 2) * 2 + 2

    # Loads pretrained model BERT.
    model = BertForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    model = paddle.DataParallel(model)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    criterion = BCELossForDuIE()

    # Loads dataset.
    collator = DataCollator()
    train_file_path = os.path.join(args.data_path, 'train_data.json')
    train_dataset = DuIEDataset.from_file(train_file_path, tokenizer,
                                          args.max_seq_length, True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=collator,
                                   return_list=True)

    eval_file_path = os.path.join(args.data_path, 'dev_data.json')
    test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer,
                                         args.max_seq_length, True)
    test_batch_sampler = paddle.io.BatchSampler(test_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False,
                                                drop_last=True)
    test_data_loader = DataLoader(dataset=test_dataset,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=collator,
                                  return_list=True)

    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_ratio)

    # Generate parameter names needed to perform weight decay.
    # Parameters whose name contains "bias" or "norm" are excluded.
    excluded_markers = ["bias", "norm"]
    decay_params = [
        param.name for name, param in model.named_parameters()
        if not any(marker in name for marker in excluded_markers)
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    def evaluate_and_report():
        # Runs the dev-set evaluation and prints the metrics as percentages.
        precision, recall, f1 = self.evaluate(model, criterion,
                                              test_data_loader,
                                              eval_file_path, "eval")
        print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
              (100 * precision, 100 * recall, 100 * f1))

    def save_checkpoint(at_step):
        # Writes the current weights to <output_dir>/model_<at_step>.pdparams.
        paddle.save(
            model.state_dict(),
            os.path.join(args.output_dir, "model_%d.pdparams" % at_step))

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 10000
    is_main_process = rank == 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        print("\n=====start training of %d epochs=====" % epoch)
        epoch_started = time.time()
        model.train()
        for step, batch in enumerate(train_data_loader):
            # Only the token ids and labels are consumed during training.
            input_ids, _seq_lens, _tok_start, _tok_end, labels = batch
            logits = model(input_ids=input_ids)
            # Positions holding ids 0, 1 or 2 are masked out of the loss
            # (presumably special tokens such as padding - TODO confirm).
            not_id_0 = input_ids != 0
            not_id_1 = input_ids != 1
            not_id_2 = input_ids != 2
            mask = not_id_0.logical_and(not_id_1).logical_and(not_id_2)
            loss = criterion(logits, labels, mask)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            global_step += 1

            if is_main_process and global_step % logging_steps == 0:
                print(
                    "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                    % (epoch, args.num_train_epochs, step, steps_by_epoch,
                       loss_item, logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if is_main_process and global_step % save_steps == 0:
                print("\n=====start evaluating ckpt of %d steps=====" %
                      global_step)
                evaluate_and_report()
                print("saving checkpoing model_%d.pdparams to %s " %
                      (global_step, args.output_dir))
                save_checkpoint(global_step)
                model.train()  # back to train mode

        epoch_elapsed = time.time() - epoch_started
        print("epoch time footprint: %d hour %d min %d sec" %
              (epoch_elapsed // 3600, (epoch_elapsed % 3600) // 60,
               epoch_elapsed % 60))

    # Does final evaluation.
    if is_main_process:
        print("\n=====start evaluating last ckpt of %d steps=====" %
              global_step)
        evaluate_and_report()
        save_checkpoint(global_step)
        print("\n=====training complete=====")
def do_predict(args):
    """Predict NER tags for the 'msra_ner' test split and save them to a file.

    Decoded predictions are written to ``results.txt`` in the current working
    directory and the first ten are printed.

    Args:
        args: Namespace providing ``use_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size`` and ``init_checkpoint_path``.
    """
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_ds, predict_ds = load_dataset('msra_ner',
                                        splits=('train', 'test'),
                                        lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    # The label set is taken from the training split.
    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        # Segment ids are padded with the tokenizer's segment pad id (as the
        # sibling train/eval functions do), not the vocabulary pad id.
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label),  # label
    }): fn(samples)

    # Keep the untransformed examples: parse_decodes() receives them together
    # with the predictions.
    raw_data = predict_ds.data
    id2label = dict(enumerate(predict_ds.label_list))

    predict_ds = predict_ds.map(trans_func)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    # Gradients are never needed for inference; skip building the graph.
    with paddle.no_grad():
        for batch in predict_data_loader:
            input_ids, token_type_ids, length, _labels = batch
            logits = model(input_ids, token_type_ids)
            pred = paddle.argmax(logits, axis=-1)
            pred_list.append(pred.numpy())
            len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
def do_predict(args):
    """Predict NER tags for the MSRA_NER test set (legacy dataset API).

    Decoded predictions are written to ``results.txt`` in the current working
    directory and the first ten are printed.

    Args:
        args: Namespace providing ``use_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size`` and ``init_checkpoint_path``.
    """
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_dataset, predict_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "test"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    # The label set is taken from the training split.
    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=no_entity_id,
                         max_seq_length=args.max_seq_length)

    ignore_label = -100
    pad_id = tokenizer.vocab[tokenizer.pad_token]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_id),  # input
        Pad(axis=0, pad_val=pad_id),  # segment
        Stack(),  # length
        Pad(axis=0, pad_val=ignore_label),  # label
    ): fn(samples)

    # Keep the untransformed examples: parse_decodes() receives them together
    # with the predictions.
    raw_data = predict_dataset.data
    id2label = dict(enumerate(predict_dataset.get_labels()))

    predict_dataset = predict_dataset.apply(trans_func, lazy=True)
    # drop_last must be False here: otherwise the final partial batch is
    # discarded and its examples never reach the results file.
    predict_batch_sampler = paddle.io.BatchSampler(predict_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False,
                                                   drop_last=False)
    predict_data_loader = DataLoader(dataset=predict_dataset,
                                     batch_sampler=predict_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    # Gradients are never needed for inference; skip building the graph.
    with paddle.no_grad():
        for batch in predict_data_loader:
            input_ids, segment_ids, length, _labels = batch
            logits = model(input_ids, segment_ids)
            pred = paddle.argmax(logits, axis=-1)
            pred_list.append(pred.numpy())
            len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
def do_train(args):
    """Fine-tune BERT for NER on the 'msra_ner' dataset.

    Every ``args.save_steps`` steps, and at the last step, the process that
    passes the rank check evaluates on the test split and saves a
    ``model_<step>.pdparams`` checkpoint to ``args.output_dir``.

    Training stops after ``args.max_steps`` optimizer steps when that value
    is positive and smaller than ``num_train_epochs * len(train_data_loader)``;
    otherwise it runs for the full number of epochs.

    Args:
        args: Namespace providing ``n_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    train_ds = train_ds.map(trans_func)

    # Label positions padded with ``ignore_label`` are skipped by the loss.
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),  # seq_len
        'labels': Pad(axis=0, pad_val=ignore_label),  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)
    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    test_ds = test_ds.map(trans_func)
    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    steps_in_all_epochs = len(train_data_loader) * args.num_train_epochs
    num_training_steps = (args.max_steps
                          if args.max_steps > 0 else steps_in_all_epochs)
    # The step at which training really ends: max_steps cannot extend the
    # run beyond the configured number of epochs.
    last_step = min(num_training_steps, steps_in_all_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # Parameters whose name contains "bias" or "norm" are excluded.
    # A set keeps the per-parameter membership test O(1).
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.save_steps == 0 or global_step == last_step:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))

            # Honour max_steps: the LR schedule reaches zero here, so any
            # further step would be wasted work.
            if global_step >= last_step:
                return
def do_train(args):
    """Fine-tune BERT for NER on MSRA_NER (legacy dataset API).

    Every ``args.save_steps`` optimizer steps the model is evaluated on the
    dev set, and the process that passes the rank check saves a
    ``model_<step>.pdparams`` checkpoint to ``args.output_dir``.

    The step counter is 1-based (incremented before logging/saving), so the
    first evaluation and checkpoint happen after ``save_steps`` steps rather
    than right after the first step.

    Args:
        args: Namespace providing ``n_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps``,
            ``save_steps`` and ``output_dir``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, dev_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "dev"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=label_list,
                         no_entity_id=no_entity_id,
                         max_seq_length=args.max_seq_length)

    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)

    # Label positions padded with ``ignore_label`` are skipped by the loss.
    ignore_label = -100
    pad_id = tokenizer.vocab[tokenizer.pad_token]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_id),  # input
        Pad(axis=0, pad_val=pad_id),  # segment
        Stack(),  # length
        Pad(axis=0, pad_val=ignore_label),  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    # NOTE(review): drop_last=True discards the final partial dev batch, so
    # the reported metrics skip a few examples - confirm this is intended.
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               drop_last=True)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_warmup_steps = args.warmup_steps
    num_training_steps = (args.max_steps if args.max_steps > 0 else
                          len(train_data_loader) * args.num_train_epochs)

    def warmup_then_linear_decay(current_step):
        # LR multiplier: ramps 0 -> 1 over the warmup steps, then decays
        # linearly to 0 at num_training_steps (never negative).
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   warmup_then_linear_decay)

    # Names of the parameters that receive weight decay: everything whose
    # name contains neither "bias" nor "norm". Built once, as a set, instead
    # of being rebuilt as a list on every apply_decay_param_fun call.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(int(math.ceil((label_num + 1) / 2.0)), "IOB")

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            # Count the step first so the logging/saving intervals line up
            # with the number of optimizer steps actually taken.
            global_step += 1
            input_ids, segment_ids, length, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()

            if global_step % args.save_steps == 0:
                evaluate(model, loss_fct, metric, dev_data_loader, label_num)
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
def do_eval(args):
    """Evaluate a BERT token-classification model on the 'msra_ner' test split.

    This variant reads the label names from
    ``features['ner_tags'].feature.names`` and tokenizes with a batched
    ``map``. It prints the loss together with the precision, recall and F1
    reported by ``ChunkEvaluator``.

    Note: the printed loss is the mean loss of the *last* batch only (this is
    the original behaviour); the metrics are accumulated over all batches.

    Args:
        args: Namespace providing ``device``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size`` and ``init_checkpoint_path``.
    """
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        """Tokenize a batch and give every input position a label id."""
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            seq_len = len(tokenized_inputs['input_ids'][i])
            label_ids = label
            # Truncate so the labels plus the two extra positions added
            # below fit the tokenized length.
            if seq_len - 2 < len(label_ids):
                label_ids = label_ids[:seq_len - 2]
            # One no-entity label on each side, then fill any remainder.
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (seq_len - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # Label positions padded with ``ignore_label`` are skipped by the loss.
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0,
                         pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64'),  # label
    }): fn(samples)

    # The final example of the split is deliberately left out.
    # NOTE(review): presumably because it is empty - confirm.
    eval_ds = eval_ds.select(range(len(eval_ds) - 1))
    eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    # Defined up front so the final report still works for an empty loader.
    avg_loss = float("nan")
    # Gradients are never needed for evaluation; skip building the graph.
    with paddle.no_grad():
        for batch in eval_data_loader:
            input_ids, token_type_ids, length, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            preds = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = \
                metric.compute(length, preds, labels)
            metric.update(num_infer_chunks.numpy(),
                          num_label_chunks.numpy(),
                          num_correct_chunks.numpy())

    # accumulate() reports over everything passed to update(), so calling it
    # once after the loop yields the same numbers as calling it per batch.
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def do_predict(args):
    """Predict NER tags for the 'msra_ner' test split and save them to a file.

    This variant reads the label names from
    ``features['ner_tags'].feature.names`` and tokenizes with a batched
    ``map``. Decoded predictions are written to ``results.txt`` in the current
    working directory and the first ten are printed.

    Args:
        args: Namespace providing ``device``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size`` and ``init_checkpoint_path``.
    """
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_examples, predict_examples = load_dataset('msra_ner',
                                                    split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_examples.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        """Tokenize a batch and give every input position a label id."""
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            seq_len = len(tokenized_inputs['input_ids'][i])
            label_ids = label
            # Truncate so the labels plus the two extra positions added
            # below fit the tokenized length.
            if seq_len - 2 < len(label_ids):
                label_ids = label_ids[:seq_len - 2]
            # One no-entity label on each side, then fill any remainder.
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (seq_len - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label),  # label
    }): fn(samples)

    id2label = dict(enumerate(label_list))

    # The final example of the split is deliberately left out.
    # NOTE(review): presumably because it is empty - confirm.
    predict_examples = predict_examples.select(
        range(len(predict_examples) - 1))
    predict_ds = predict_examples.map(tokenize_and_align_labels, batched=True)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    # Gradients are never needed for inference; skip building the graph.
    with paddle.no_grad():
        for batch in predict_data_loader:
            input_ids, token_type_ids, length, _labels = batch
            logits = model(input_ids, token_type_ids)
            pred = paddle.argmax(logits, axis=-1)
            pred_list.append(pred.numpy())
            len_list.append(length.numpy())

    preds = parse_decodes(predict_examples, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
def do_train(args):
    """Fine-tune BERT for NER on 'msra_ner' using lazily loaded datasets.

    Every ``args.save_steps`` steps, and once more after the last epoch if the
    final step was not already saved, the process that passes the rank check
    evaluates on the test split and saves a ``model_<step>.pdparams``
    checkpoint to ``args.output_dir``.

    The datasets are loaded with ``lazy=True``, and the LR schedule length is
    not derived from the data loader: it is ``args.max_steps`` when that
    attribute exists and is positive, otherwise the historical default of
    2812 steps.

    Args:
        args: Namespace providing ``n_gpu``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``num_train_epochs``,
            ``learning_rate``, ``warmup_steps``, ``adam_epsilon``,
            ``weight_decay``, ``logging_steps``, ``save_steps``,
            ``output_dir`` and, optionally, ``max_steps``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, test_dataset = load_dataset('msra_ner',
                                               splits=('train', 'test'),
                                               lazy=True)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    train_dataset = train_dataset.map(trans_func)
    # NOTE(review): shard() presumably splits the stream across distributed
    # workers - confirm against the dataset implementation.
    train_dataset = train_dataset.shard()

    # Label positions padded with ``ignore_label`` are skipped by the loss.
    ignore_label = -100
    pad_id = tokenizer.vocab[tokenizer.pad_token]
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=pad_id),  # input
        'segment_ids': Pad(axis=0, pad_val=pad_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label),  # label
    }): fn(samples)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_size=args.batch_size,
                                   return_list=True)

    test_dataset = test_dataset.map(trans_func)
    test_data_loader = DataLoader(dataset=test_dataset,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Length of the LR schedule: prefer an explicit, positive args.max_steps
    # and fall back to the previously hard-coded value.
    default_training_steps = 2812
    requested_steps = getattr(args, "max_steps", 0) or 0
    num_training_steps = (requested_steps
                          if requested_steps > 0 else default_training_steps)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    # Names of the parameters that receive weight decay: everything whose
    # name contains neither "bias" nor "norm". Built once, as a set, instead
    # of being rebuilt as a list on every apply_decay_param_fun call.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    def is_saving_process():
        # Single-device runs always save; otherwise only rank 0 does.
        return (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0

    def evaluate_and_save(step_id):
        # Evaluates on the test split, then writes model_<step_id>.pdparams.
        evaluate(model, loss_fct, metric, test_data_loader, label_num)
        paddle.save(
            model.state_dict(),
            os.path.join(args.output_dir, "model_%d.pdparams" % step_id))

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, _, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()

            if global_step % args.save_steps == 0:
                if is_saving_process():
                    evaluate_and_save(global_step)

    # Save final model (unless the last step was already checkpointed).
    if global_step % args.save_steps != 0:
        if is_saving_process():
            evaluate_and_save(global_step)