def do_train(args):
    """Train a width-elastic (DynaBERT-style) BERT supernet on a GLUE task.

    Pipeline: build train/dev data loaders; convert the pretrained model into
    an OFA supernet; distill from a frozen full-size teacher copy; reorder
    neurons/attention heads by importance; then, for every width multiplier in
    ``args.width_mult_list``, run a forward/backward pass on each batch.
    Checkpoints and evaluations happen every ``args.save_steps`` steps.
    """
    # NOTE(review): args.n_gpu looks like a GPU count used as a boolean
    # device switch here — confirm against the argument parser.
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('glue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # Regression tasks expose label_list=None and use float32 labels.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        # MNLI evaluates on two dev splits: matched and mismatched.
        dev_ds_matched, dev_ds_mismatched = load_dataset(
            'glue', args.task_name, splits=["dev_matched", "dev_mismatched"])
        dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True)
        dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_ds_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_ds_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_ds_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_ds_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_ds = load_dataset('glue', args.task_name, splits='dev')
        dev_ds = dev_ds.map(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    # num_labels=1 selects the regression head (label_list is None).
    num_labels = 1 if train_ds.label_list == None else len(
        train_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Step1: Initialize a dictionary to save the weights from the origin
    # BERT model, so they can be restored after the supernet conversion.
    origin_weights = {}
    for name, param in model.named_parameters():
        origin_weights[name] = param

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=args.width_mult_list)
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    # Step3: Define teacher model — a second, frozen full-width copy loaded
    # from the same checkpoint; it provides distillation targets.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation: match embeddings plus every encoder
    # layer between student and teacher.
    mapping_layers = ['bert.embeddings']
    for idx in range(model.bert.config['num_hidden_layers']):
        mapping_layers.append('bert.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training — width is the only elastic axis.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    if args.task_name == "mnli":
        # Importance computation below accepts both loaders as a tuple.
        dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)

    # Step6: Calculate the importance of neurons and head,
    # and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.bert.config['num_hidden_layers'],
        num_heads=model.bert.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=ofa_model.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            # Accumulate gradients across every width before one optimizer
            # step, so all sub-networks are trained on each batch.
            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(input_ids,
                                                   segment_ids,
                                                   attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                # STS-B is regression; logit distillation is skipped for it.
                if args.task_name == 'sts-b':
                    logit_loss = 0.0
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                # Evaluate the full-width teacher first.
                # NOTE(review): width_mult=100 appears to be a sentinel for
                # "teacher / full width" — confirm in evaluate().
                if args.task_name == "mnli":
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_matched,
                             width_mult=100)
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_mismatched,
                             width_mult=100)
                else:
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader,
                             width_mult=100)
                # Then evaluate (and checkpoint) each student width.
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    if args.task_name == "mnli":
                        # NOTE(review): acc is assigned but never read here.
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader_matched, width_mult)
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader_mismatched, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))
                    else:
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))
                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
def run(args):
    """Fine-tune and/or evaluate a transformer on SQuAD-style extractive QA.

    Driven entirely by ``args``: device placement, model/tokenizer choice,
    training hyper-parameters, and the ``do_train`` / ``do_predict``
    switches. Checkpoints (model + tokenizer) are written every
    ``args.save_steps`` steps by rank 0.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.do_train:
        # BUG FIX: the dataset name was misspelled as 'sqaud', so
        # load_dataset could not resolve the builtin SQuAD reader when a
        # custom train file was supplied.
        if args.train_file:
            train_ds = load_dataset('squad', data_files=args.train_file)
        elif args.version_2_with_negative:
            train_ds = load_dataset('squad', splits='train_v2')
        else:
            train_ds = load_dataset('squad', splits='train_v1')
        train_ds.map(partial(prepare_train_features,
                             tokenizer=tokenizer,
                             args=args),
                     batched=True)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        # Training batches carry the answer-span targets.
        train_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        }): fn(samples)
        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        # When max_steps caps training, derive the epoch count from it.
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, token_type_ids, start_positions, end_positions = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if rank == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                    if global_step == num_training_steps:
                        break

    if args.do_predict and rank == 0:
        # Same misspelling fix ('sqaud' -> 'squad') for custom predict files.
        if args.predict_file:
            dev_ds = load_dataset('squad', data_files=args.predict_file)
        elif args.version_2_with_negative:
            dev_ds = load_dataset('squad', splits='dev_v2')
        else:
            dev_ds = load_dataset('squad', splits='dev_v1')
        dev_ds.map(partial(prepare_validation_features,
                           tokenizer=tokenizer,
                           args=args),
                   batched=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        # Eval batches have no answer spans.
        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)
        evaluate(model, dev_data_loader, args)
['train', 'dev', 'test']) # Constructs the newtork. label_list = train_ds.get_labels() model = ppnlp.models.Senta(network=args.network, vocab_size=len(vocab), num_classes=len(label_list)) model = paddle.Model(model) # Reads data and generates mini-batches. trans_fn = partial(convert_example, vocab=vocab, unk_token_id=vocab.get('[UNK]', 1), is_test=False) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=vocab['[PAD]']), # input_ids Stack(dtype="int64"), # seq len Stack(dtype="int64") # label ): [data for data in fn(samples)] train_loader = create_dataloader(train_ds, trans_fn=trans_fn, batch_size=args.batch_size, mode='train', use_gpu=args.use_gpu, pad_token_id=vocab.get('[PAD]', 0), batchify_fn=batchify_fn) dev_loader = create_dataloader(dev_ds, trans_fn=trans_fn, batch_size=args.batch_size, mode='validation', use_gpu=args.use_gpu,
no_entity_label_idx = label_map.get("O", 2) set_seed(args.seed) skep = SkepModel.from_pretrained('skep_ernie_1.0_large_ch') model = SkepCrfForTokenClassification( skep, num_classes=len(train_ds.label_list)) tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch') trans_func = partial( convert_example_to_feature, tokenizer=tokenizer, max_seq_len=args.max_seq_length, no_entity_label=no_entity_label_idx, is_test=False) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input ids Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # token type ids Stack(dtype='int64'), # sequence lens Pad(axis=0, pad_val=no_entity_label_idx) # labels ): [data for data in fn(samples)] train_data_loader = create_dataloader( train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict)
def create_data_loader(args, tokenizer):
    """Build the GLUE train/dev data loaders for ``args.task_name``.

    Returns:
        For "mnli": (train_loader, dev_loader_matched, dev_loader_mismatched,
        train_ds, dev_ds_matched, dev_ds_mismatched).
        For any other task: (train_loader, dev_loader, train_ds, dev_ds).
    """
    train_ds = load_dataset('glue', args.task_name, splits="train")
    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_ds.label_list,
        max_seq_length=args.max_seq_length,
        pad_to_max_seq_len=args.pad_to_max_seq_len,
    )
    train_ds = train_ds.map(trans_func, lazy=True)

    # Left padding (pad_right=False) — presumably for left-padded models
    # such as XLNet; confirm against the model this feeds.
    # Regression tasks expose label_list=None and use float32 labels.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False
            ),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)

    def _dev_loader(dataset):
        # Every dev split is read sequentially with the shared collator.
        sampler = paddle.io.BatchSampler(dataset,
                                         batch_size=args.batch_size,
                                         shuffle=False)
        return DataLoader(dataset=dataset,
                          batch_sampler=sampler,
                          collate_fn=batchify_fn,
                          num_workers=0,
                          return_list=True)

    train_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    if args.task_name == "mnli":
        # MNLI has two dev splits: matched and mismatched.
        dev_ds_matched, dev_ds_mismatched = load_dataset(
            'glue', args.task_name, splits=["dev_matched", "dev_mismatched"])
        dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True)
        dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True)
        return (train_data_loader, _dev_loader(dev_ds_matched),
                _dev_loader(dev_ds_mismatched), train_ds, dev_ds_matched,
                dev_ds_mismatched)

    dev_ds = load_dataset('glue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    return train_data_loader, _dev_loader(dev_ds), train_ds, dev_ds
def do_train(args):
    """Static-graph GLUE fine-tuning: build train/dev programs, run them
    with an Executor, and checkpoint parameters every ``args.save_steps``.
    """
    # Set the paddle execute environment to static-graph mode.
    paddle.enable_static()
    place = paddle.set_device(args.select_device)
    set_seed(args)

    # Create the main_program for the training and dev_program for the
    # validation.
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    dev_program = paddle.static.Program()

    # Get the configuration of tokenizer and model.
    args.task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    dataset_class, metric_class = TASK_CLASSES[args.task_name]

    # Create the tokenizer and dataset.
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    train_dataset = dataset_class.get_datasets(["train"])

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)

    # Regression tasks expose get_labels() == None and use float32 labels.
    # NOTE(review): the segment field is padded with pad_token_id here,
    # not pad_token_type_id as in the dynamic-graph script — confirm intended.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(dtype="int64" if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples))]

    train_batch_sampler = paddle.io.BatchSampler(train_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=True)
    # NOTE(review): feed_list_name is never used below.
    feed_list_name = []

    # Define the input data and create the train/dev data_loader.
    with paddle.static.program_guard(main_program, startup_program):
        [input_ids, segment_ids, labels] = create_data_holder(args.task_name)

        train_data_loader = DataLoader(
            dataset=train_dataset,
            feed_list=[input_ids, segment_ids, labels],
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=False)
        if args.task_name == "mnli":
            # MNLI evaluates on two dev splits: matched and mismatched.
            dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
                ["dev_matched", "dev_mismatched"])
            dev_dataset_matched = dev_dataset_matched.apply(trans_func,
                                                            lazy=True)
            dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                                  lazy=True)
            dev_batch_sampler_matched = paddle.io.BatchSampler(
                dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
            dev_data_loader_matched = DataLoader(
                dataset=dev_dataset_matched,
                batch_sampler=dev_batch_sampler_matched,
                feed_list=[input_ids, segment_ids, labels],
                collate_fn=batchify_fn,
                num_workers=0,
                return_list=False)
            dev_batch_sampler_mismatched = paddle.io.BatchSampler(
                dev_dataset_mismatched,
                batch_size=args.batch_size,
                shuffle=False)
            dev_data_loader_mismatched = DataLoader(
                dataset=dev_dataset_mismatched,
                feed_list=[input_ids, segment_ids, labels],
                batch_sampler=dev_batch_sampler_mismatched,
                collate_fn=batchify_fn,
                num_workers=0,
                return_list=False)
        else:
            dev_dataset = dataset_class.get_datasets(["dev"])
            dev_dataset = dev_dataset.apply(trans_func, lazy=True)
            dev_batch_sampler = paddle.io.BatchSampler(
                dev_dataset, batch_size=args.batch_size, shuffle=False)
            dev_data_loader = DataLoader(
                dataset=dev_dataset,
                feed_list=[input_ids, segment_ids, labels],
                batch_sampler=dev_batch_sampler,
                collate_fn=batchify_fn,
                num_workers=0,
                return_list=False)

    # Create the training-forward program, and clone it for the validation.
    with paddle.static.program_guard(main_program, startup_program):
        num_class = 1 if train_dataset.get_labels() is None else len(
            train_dataset.get_labels())
        model, pretrained_state_dict = model_class.from_pretrained(
            args.model_name_or_path, num_classes=num_class)
        loss_fct = paddle.nn.loss.CrossEntropyLoss(
        ) if train_dataset.get_labels() else paddle.nn.loss.MSELoss()
        logits = model(input_ids, segment_ids)
        loss = loss_fct(logits, labels)
        # Clone before the backward pass is added, so dev_program holds the
        # forward graph only.
        dev_program = main_program.clone(for_test=True)

    # Create the training-backward program, this pass will not be
    # executed in the validation.
    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs
    with paddle.static.program_guard(main_program, startup_program):
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_steps)
        # Weight decay is applied to every parameter except bias/LayerNorm.
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in [
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            ])
        optimizer.minimize(loss)

    # Create the metric pass for the validation.
    with paddle.static.program_guard(dev_program, startup_program):
        metric = metric_class()
        correct = metric.compute(logits, labels)

    # Initialize the fine-tuning parameter, we will load the parameters in
    # pre-training model. And initialize the parameter which not in
    # pre-training model by the normal distribution.
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    state_dict = model.state_dict()
    reset_state_dict = reset_program_state_dict(args, model, state_dict,
                                                pretrained_state_dict)
    paddle.static.set_program_state(main_program, reset_state_dict)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            loss_return = exe.run(main_program,
                                  feed=batch,
                                  fetch_list=[loss])
            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss_return[0],
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            lr_scheduler.step()
            if global_step % args.save_steps == 0:
                # Validation pass, record the loss and metric.
                if args.task_name == "mnli":
                    evaluate(exe, metric, loss, correct, dev_program,
                             dev_data_loader_matched)
                    evaluate(exe, metric, loss, correct, dev_program,
                             dev_data_loader_mismatched)
                else:
                    evaluate(exe, metric, loss, correct, dev_program,
                             dev_data_loader)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                paddle.fluid.io.save_params(exe, output_dir)
                tokenizer.save_pretrained(output_dir)
def train(args):
    """Fine-tune ERNIE on the DuReader-robust extractive QA dataset.

    Builds train/dev/test loaders, trains for ``args.epochs`` epochs,
    saves the model and optimizer state dicts, then evaluates on dev.
    """
    # Load the train/dev/test splits.
    train_ds, dev_ds, test_ds = load_dataset('dureader_robust',
                                             splits=('train', 'dev', 'test'))

    tokenizer = paddlenlp.transformers.ErnieTokenizer.from_pretrained(
        args.model_name)
    train_trans_func = partial(prepare_train_features,
                               max_seq_length=args.max_seq_length,
                               doc_stride=args.doc_stride,
                               tokenizer=tokenizer)
    train_ds.map(train_trans_func, batched=True, num_workers=4)

    dev_trans_func = partial(prepare_validation_features,
                             max_seq_length=args.max_seq_length,
                             doc_stride=args.doc_stride,
                             tokenizer=tokenizer)
    dev_ds.map(dev_trans_func, batched=True, num_workers=4)
    test_ds.map(dev_trans_func, batched=True, num_workers=4)

    # Batch samplers: only training shuffles.
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)

    # Collation: training batches carry answer spans, eval batches do not.
    train_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64")
    }): fn(samples)

    dev_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    # Build the DataLoaders.
    train_data_loader = paddle.io.DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=train_batchify_fn,
        return_list=True)

    dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                           batch_sampler=dev_batch_sampler,
                                           collate_fn=dev_batchify_fn,
                                           return_list=True)

    test_data_loader = paddle.io.DataLoader(dataset=test_ds,
                                            batch_sampler=test_batch_sampler,
                                            collate_fn=dev_batchify_fn,
                                            return_list=True)

    # Training configuration.
    num_training_steps = len(train_data_loader) * args.epochs
    use_gpu = paddle.get_device().startswith("gpu")
    if use_gpu:
        paddle.set_device('gpu:0')

    lr_scheduler = paddlenlp.transformers.LinearDecayWithWarmup(
        args.learning_rate, num_training_steps, args.warmup_proportion)
    model = ErnieForQuestionAnswering.from_pretrained(args.model_name)

    # Exclude bias and LayerNorm parameters from weight decay.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Training loop.
    model.train()
    criterion = CrossEntropyLossForRobust()
    global_step = 0
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            global_step += 1
            input_ids, segment_ids, start_positions, end_positions = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, (start_positions, end_positions))

            if global_step % 100 == 0:
                print("global step %d, epoch: %d, batch: %d, loss: %.5f" %
                      (global_step, epoch, step, loss))

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

    paddle.save(model.state_dict(), args.save_model_path)
    # BUG FIX: the optimizer checkpoint path previously received a second
    # copy of the *model* state dict; save the optimizer state instead so
    # training can actually be resumed.
    paddle.save(optimizer.state_dict(), args.save_opt_path)

    evaluate(model=model, data_loader=dev_data_loader)
def _get_predictions(self, dicts):
    """
    Feed a preprocessed dataset to the model and get the actual predictions
    (forward pass + formatting).

    :param dicts: list of dictionaries
     examples:[{'query': "where is florida?"}, {'query': "who wrote lord of the rings?"}, ...]
             [{'passages': [{
                 "title": 'Big Little Lies (TV series)',
                 "text": 'series garnered several accolades. It received..',
                 "label": 'positive',
                 "external_id": '18768923'},
                {"title": 'Framlingham Castle',
                 "text": 'Castle on the Hill "Castle on the Hill" is a song by English..',
                 "label": 'positive',
                 "external_id": '19930582'}, ...]
    :return: dictionary of embeddings for "passages" and "query"
    """
    # NOTE(review): tensor_names and baskets are returned by the processor
    # but unused here.
    dataset, tensor_names, _, baskets = self.processor.dataset_from_dicts(
        dicts, indices=[i for i in range(len(dicts))], return_baskets=True)

    # Collate into padded (input_ids, token_type_ids) pairs.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.passage_tokenizer.pad_token_id),  # input_ids
        Pad(axis=0, pad_val=self.passage_tokenizer.pad_token_type_id),  # token_type_ids
    ): [data for data in fn(samples)]

    batch_sampler = paddle.io.BatchSampler(dataset,
                                           batch_size=self.batch_size,
                                           shuffle=False)
    data_loader = paddle.io.DataLoader(dataset=dataset,
                                       batch_sampler=batch_sampler,
                                       collate_fn=batchify_fn,
                                       return_list=True)

    all_embeddings = {"query": [], "passages": []}
    # Todo(tianxin04): ErnieDualEncoder subclass nn.Module,
    self.ernie_dual_encoder.eval()

    # When running evaluations etc., we don't want a progress bar for every
    # single query.
    if len(dataset) == 1:
        disable_tqdm = True
    else:
        disable_tqdm = not self.progress_bar

    with tqdm(
            total=len(data_loader) * self.batch_size,
            unit=" Docs",
            desc=f"Create embeddings",
            position=1,
            leave=False,
            disable=disable_tqdm,
    ) as progress_bar:
        for batch in data_loader:
            input_ids, token_type_ids = batch
            with paddle.no_grad():
                # Pooled CLS embedding for the whole batch.
                cls_embeddings = self.ernie_dual_encoder.get_pooled_embedding(
                    input_ids=input_ids, token_type_ids=token_type_ids)
            # The first input dict decides whether this is a query batch or a
            # passage batch; the same embeddings are routed accordingly.
            if "query" in dicts[0]:
                all_embeddings["query"].append(cls_embeddings.cpu()
                                               .numpy())
            if "passages" in dicts[0]:
                all_embeddings["passages"].append(cls_embeddings.cpu()
                                                  .numpy())
            progress_bar.update(self.batch_size)

    # Concatenate the per-batch arrays into one array per key.
    if all_embeddings["passages"]:
        all_embeddings["passages"] = np.concatenate(all_embeddings[
            "passages"])
    if all_embeddings["query"]:
        all_embeddings["query"] = np.concatenate(all_embeddings["query"])
    return all_embeddings
def train(args):
    """Fine-tune XLNet for binary sentiment classification on IMDB."""
    # Load the raw IMDB data.
    trainset = IMDBDataset(is_training=True)
    testset = IMDBDataset(is_training=False)

    # Wrap as MapDataset (labels are 0/1).
    train_ds = MapDataset(trainset, label_list=[0, 1])
    test_ds = MapDataset(testset, label_list=[0, 1])

    # Build the XLNet tokenizer.
    tokenizer = XLNetTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)

    # Construct train_data_loader and dev_data_loader.
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # XLNet-style left padding (pad_right=False); float labels are for
    # regression tasks where label_list is None.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False
            ),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    # The dev loader reuses the test set.
    dev_ds = MapDataset(testset)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    # Training configuration.
    # Fix the random seed.
    set_seed(args)

    # Pick the run device.
    use_gpu = True if paddle.get_device().startswith("gpu") else False
    if use_gpu:
        paddle.set_device('gpu:0')

    num_classes = len(train_ds.label_list)
    model = XLNetForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # Configure the lr_scheduler.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Configure the optimizer: global-norm gradient clipping, and weight
    # decay on everything except bias/LayerNorm parameters.
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        grad_clip=clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Model training.
    metric = Accuracy()

    # Loss: cross-entropy for classification, MSE for regression.
    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    global_step = 0
    tic_train = time.time()
    model.train()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, attention_mask, labels = batch
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if (not paddle.distributed.get_world_size() > 1
                    ) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                if global_step == num_training_steps:
                    exit(0)
                # Exclude the evaluation time from the step-speed estimate.
                tic_train += time.time() - tic_eval
def do_train(args):
    """Fine-tune a token-classification (NER) model and periodically evaluate/save.

    Builds train/test (and, for peoples_daily_ner, dev) dataloaders, wraps the
    model in DataParallel when more than one device is available, and runs an
    AdamW + linear-warmup-decay training loop.  Rank 0 evaluates and writes
    checkpoints as ``model_<step>.pdparams`` under ``args.output_dir``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    if args.dataset == "peoples_daily_ner":
        train_ds, dev_ds, test_ds = load_dataset(
            args.dataset, splits=('train', 'dev', 'test'), lazy=False)
    else:
        # Other datasets here only ship train/test splits, so no dev loop.
        train_ds, test_ds = load_dataset(
            args.dataset, splits=('train', 'test'), lazy=False)

    AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    # Tokens outside any entity get the last label index.
    no_entity_id = label_num - 1

    trans_func = partial(
        tokenize_and_align_labels,
        tokenizer=tokenizer,
        no_entity_id=no_entity_id,
        max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    # Label positions padded with -100 are ignored by CrossEntropyLoss below.
    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),  # seq_len
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(
        dataset=train_ds,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_sampler=train_batch_sampler,
        return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(
        dataset=test_ds,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_size=args.batch_size,
        return_list=True)

    if args.dataset == "peoples_daily_ner":
        dev_ds = dev_ds.map(trans_func)
        dev_data_loader = DataLoader(
            dataset=dev_ds,
            collate_fn=batchify_fn,
            num_workers=0,
            batch_size=args.batch_size,
            return_list=True)

    # Define the model network and its loss.
    model = AutoForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            # The third element (seq_len) is only needed during evaluation.
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                # Only rank 0 evaluates and writes checkpoints.
                if paddle.distributed.get_rank() == 0:
                    if args.dataset == "peoples_daily_ner":
                        evaluate(model, loss_fct, metric, dev_data_loader,
                                 label_num, "valid")
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num, "test")
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            if global_step >= num_training_steps:
                return
def train():
    """Fine-tune ErnieForGeneration on the Poetry dataset.

    Selects a tokenizer matching the pretrained model name, optionally warm
    starts from ``args.init_checkpoint``, then trains with AdamW + linear
    warmup/decay and global-norm gradient clipping.  At every
    ``args.save_steps`` the process that owns rank 0 (or the single process)
    evaluates with ROUGE-1/2 and saves model + tokenizer under
    ``args.output_dir/model_<step>``.  Reads configuration from the
    module-level ``args``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    # Pick the tokenizer family from the checkpoint name; BERT is the fallback.
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev'])

    # Use [ATTN] as the attention-query token if the vocab has it, else [MASK].
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    # Last sentence-embedding row is reserved for the decode (target) segment.
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_dataset, batch_size=args.batch_size, shuffle=False)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    # Vocabulary size, used as the number of classes for label smoothing.
    label_num = model.word_emb.weight.shape[0]

    train_model = StackModel(model)
    if paddle.distributed.get_world_size() > 1:
        # All 'forward' outputs derived from the module parameters using in DataParallel
        # must participate in the calculation of losses and subsequent gradient calculations.
        # So we use StackModel here to make the model only output loss in its 'forward' function.
        train_model = paddle.DataParallel(train_model)

    max_steps = len(train_data_loader) * args.num_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        # Exclude bias and LayerNorm parameters from weight decay.
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    rouge1 = Rouge1()
    rouge2 = Rouge2()
    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids, attn_ids,
             mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
             tgt_labels, _) = batch
            if args.label_smooth > 0.:
                # Smooth one-hot targets over the full vocabulary.
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            # Positions of the attention-query tokens: loss is computed there.
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                               tgt_pids, attn_ids, mask_src_2_src,
                               mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
                               tgt_labels, tgt_pos)
            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and (
                (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0):
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # Unwrap DataParallel before saving so weights load standalone.
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
def do_train(args):
    """Fine-tune an extractive machine-reading-comprehension model.

    Builds train/dev dataloaders from the TASK_CLASSES dataset for
    ``args.task_name``, trains with AdamW under a hand-rolled linear
    warmup/decay LambdaDecay schedule, checkpoints every ``args.save_steps``
    on rank 0, and runs a final evaluation on the dev set.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    task_name = args.task_name.lower()
    dataset_class = TASK_CLASSES[task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    root = args.data_path
    set_seed(args)
    train_ds = dataset_class(
        tokenizer=tokenizer,
        root=root,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        max_seq_length=args.max_seq_length,
        mode='train')
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # Drop the unique_id field (index 2): it is not needed for training.
    train_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # unique_id
        Stack(dtype="int64"),  # start_pos
        Stack(dtype="int64")  # end_pos
    ): [data for i, data in enumerate(fn(samples)) if i != 2]

    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=train_batchify_fn,
        return_list=True)

    dev_ds = dataset_class(
        tokenizer=tokenizer,
        root=root,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        max_seq_length=args.max_seq_length,
        mode='dev')
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)
    dev_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack()  # unique_id
    ): fn(samples)
    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=dev_batchify_fn,
        return_list=True)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Linear warmup then linear decay, expressed as a LambdaDecay factor.
    # warmup_proportion/num_training_steps are bound as lambda defaults so
    # they are evaluated once, here, rather than on every scheduler step.
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, warmup_proportion=args.warmup_proportion,
        num_training_steps=args.max_steps if args.max_steps > 0 else
        (len(train_ds.examples) // args.batch_size * args.num_train_epochs):
        float(current_step) / float(
            max(1, warmup_proportion * num_training_steps))
        if current_step < warmup_proportion * num_training_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps -
                    warmup_proportion * num_training_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        # Exclude bias and LayerNorm parameters from weight decay.
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    criterion = CrossEntropyLossForSQuAD()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, start_positions, end_positions = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, (start_positions, end_positions))
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # clear_grad() is the current Paddle API; other training loops in
            # this file already use it.
            optimizer.clear_grad()
            if global_step % args.save_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    print('Saving checkpoint to:', output_dir)

    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
        evaluate(model, dev_data_loader, args, tokenizer)
def do_train():
    """Fine-tune ERNIE-tiny for sequence classification on ChnSentiCorp.

    Trains with AdamW under a linear warmup/decay schedule, logs accuracy
    every 10 steps, evaluates on dev and checkpoints every 100 steps on
    rank 0, then runs a final evaluation on the test split.  Reads
    configuration from the module-level ``args``.
    """
    set_seed(args.seed)
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    train_dataset, dev_dataset, test_dataset = ppnlp.datasets.ChnSentiCorp.get_datasets(
        ['train', 'dev', 'test'])

    # If you wanna use bert/roberta/electra pretrained model,
    # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class=2)
    # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_class=2)
    # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2)
    model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
        'ernie-tiny', num_classes=len(train_dataset.get_labels()))

    # If you wanna use bert/roberta/electra pretrained model,
    # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese')
    # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext')
    # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small', num_classes=2)
    # ErnieTinyTokenizer is special for ernie-tiny pretained model.
    tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
        'ernie-tiny')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_dataset.get_labels(),
        max_seq_length=args.max_seq_length)
    # NOTE(review): the segment field is padded with pad_token_id rather than
    # pad_token_type_id as elsewhere in this file — confirm this is intended.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(
        train_dataset,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    dev_data_loader = create_dataloader(
        dev_dataset,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    test_data_loader = create_dataloader(
        test_dataset,
        mode='test',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    num_warmup_steps = int(args.warmup_proportion * num_training_steps)

    def get_lr_factor(current_step):
        # Linear warmup to 1.0, then linear decay to 0.0.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        else:
            return max(0.0,
                       float(num_training_steps - current_step) /
                       float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lr_lambda=lambda current_step: get_lr_factor(current_step))
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        # Exclude bias and LayerNorm parameters from weight decay.
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and paddle.distributed.get_rank() == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # clear_grad() is the current Paddle API; other training loops in
            # this file already use it.
            optimizer.clear_grad()
            if global_step % 100 == 0 and paddle.distributed.get_rank() == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                # model is always DataParallel-wrapped above, so unwrap here.
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
    if paddle.distributed.get_rank() == 0:
        print('Evaluating on test data.')
        evaluate(model, criterion, metric, test_data_loader)
while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples[i]["end_positions"] = token_end_index + 1 return tokenized_examples train_ds.map(prepare_train_features, lazy=False) print(train_ds[0]) print(train_ds[1]) print(len(train_ds)) print('-----------------------------------------------------') train_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input "segment_ids": Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment "start_positions": Stack(dtype="int64"), # start_pos "end_positions": Stack(dtype="int64") # end_pos }): fn(samples) train_data_loader = DataLoader( dataset=train_ds, batch_size=8, collate_fn=train_batchify_fn, return_list=True) for batch in train_data_loader: print(batch[0]) print(batch[1]) print(batch[2])
def do_train(args):
    """Fine-tune ERNIE-CTM (NPTag) on custom train/dev data.

    Loads train.txt/dev.txt from ``args.data_dir`` via ``read_custom_data``,
    trains with AdamW + linear warmup/decay, and on rank 0 saves model and
    tokenizer under ``args.output_dir/model_<step>`` and evaluates at every
    ``args.save_steps``.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(
        read_custom_data,
        filename=os.path.join(args.data_dir, "train.txt"),
        is_test=False,
        lazy=False)
    dev_ds = load_dataset(
        read_custom_data,
        filename=os.path.join(args.data_dir, "dev.txt"),
        is_test=False,
        lazy=False)

    tokenizer = ErnieCtmTokenizer.from_pretrained("nptag")
    model = ErnieCtmNptagModel.from_pretrained("nptag")
    vocab_size = model.ernie_ctm.config["vocab_size"]

    # NOTE(review): keyword is spelled "tokenzier" — it must match the
    # parameter name in convert_example; confirm before renaming.
    trans_func = partial(
        convert_example, tokenzier=tokenizer, max_seq_len=args.max_seq_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Pad(axis=0, pad_val=-100, dtype='int64'),  # labels
    ): fn(samples)

    train_data_loader = create_dataloader(
        train_ds,
        mode="train",
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    dev_data_loader = create_dataloader(
        dev_ds,
        mode="dev",
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    model = paddle.DataParallel(model)
    num_training_steps = len(train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Exclude bias and LayerNorm parameters from weight decay.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)

    metric = NPTagAccuracy()
    criterion = paddle.nn.CrossEntropyLoss()

    global_step = 0
    for epoch in range(1, args.num_train_epochs + 1):
        # Fixed log typo: "beginnig" -> "beginning".
        logger.info(f"Epoch {epoch} beginning")
        start_time = time.time()
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            # Flatten to (batch * seq_len, vocab_size) vs (batch * seq_len,).
            loss = criterion(
                logits.reshape([-1, vocab_size]), labels.reshape([-1]))

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, loss.numpy().item(), speed))
                start_time = time.time()

            if (global_step % args.save_steps == 0 or
                    global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # model is always DataParallel-wrapped above, so unwrap here.
                model._layers.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                evaluate(model, metric, criterion, dev_data_loader, vocab_size)
def run(args):
    """Train and/or predict an extractive QA model on SQuAD-style data.

    When ``args.do_train``, fine-tunes with AdamW + linear warmup/decay and
    checkpoints on rank 0; when ``args.do_predict``, evaluates on the dev
    split.  Custom data files are supported via ``args.train_file`` /
    ``args.predict_file``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)
    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(
            questions,
            contexts,
            stride=args.doc_stride,
            max_seq_len=args.max_seq_length)

        # Let's label those examples!
        for i, tokenized_example in enumerate(tokenized_examples):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_example["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # The offset mappings will give us a map from token to character position in the original context. This will
            # help us compute the start_positions and end_positions.
            offsets = tokenized_example['offset_mapping']

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            answers = examples[sample_index]['answers']
            answer_starts = examples[sample_index]['answer_starts']

            # If no answers are given, set the cls_index as answer.
            if len(answer_starts) == 0:
                tokenized_examples[i]["start_positions"] = cls_index
                tokenized_examples[i]["end_positions"] = cls_index
            else:
                # Start/end character index of the answer in the text.
                start_char = answer_starts[0]
                end_char = start_char + len(answers[0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 2
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and
                        offsets[token_end_index][1] >= end_char):
                    tokenized_examples[i]["start_positions"] = cls_index
                    tokenized_examples[i]["end_positions"] = cls_index
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples[i][
                        "start_positions"] = token_start_index - 1
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples[i][
                        "end_positions"] = token_end_index + 1

        return tokenized_examples

    if args.do_train:
        # Fixed dataset-name typo: 'sqaud' -> 'squad'.
        if args.train_file:
            train_ds = load_dataset('squad', data_files=args.train_file)
        elif args.version_2_with_negative:
            train_ds = load_dataset('squad', splits='train_v2')
        else:
            train_ds = load_dataset('squad', splits='train_v1')
        train_ds.map(prepare_train_features, lazy=False)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        }): fn(samples)

        train_data_loader = DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=train_batchify_fn,
            return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs

        lr_scheduler = LinearDecayWithWarmup(
            args.learning_rate, num_training_steps, args.warmup_proportion)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            # Exclude bias and LayerNorm parameters from weight decay.
            apply_decay_param_fun=lambda x: x in [
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            ])
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, token_type_ids, start_positions, end_positions = batch
                logits = model(
                    input_ids=input_ids, token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                # clear_grad() is the current Paddle API; other training loops
                # in this file already use it.
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(
            questions,
            contexts,
            stride=args.doc_stride,
            max_seq_len=args.max_seq_length)

        # For validation, there is no need to compute start and end positions
        for i, tokenized_example in enumerate(tokenized_examples):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            tokenized_examples[i]["example_id"] = examples[sample_index]['id']

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]

        return tokenized_examples

    if args.do_predict:
        # Fixed dataset-name typo: 'sqaud' -> 'squad'.
        if args.predict_file:
            dev_ds = load_dataset('squad', data_files=args.predict_file)
        elif args.version_2_with_negative:
            dev_ds = load_dataset('squad', splits='dev_v2')
        else:
            dev_ds = load_dataset('squad', splits='dev_v1')
        dev_ds.map(prepare_validation_features, lazy=False)

        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.batch_size, shuffle=False)
        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)

        dev_data_loader = DataLoader(
            dataset=dev_ds,
            batch_sampler=dev_batch_sampler,
            collate_fn=dev_batchify_fn,
            return_list=True)

        if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
            evaluate(model, dev_data_loader, args)
train_ds, dev_ds, test_ds = load_dataset( "lcqmc", splits=["train", "dev", "test"]) else: train_ds, dev_ds, test_ds = load_dataset( "glue", "qqp", splits=["train", "dev", "test"]) # Constructs the newtork. model = SimNet( network=args.network, vocab_size=len(vocab), num_classes=len(train_ds.label_list)) model = paddle.Model(model) # Reads data and generates mini-batches. batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # query_ids Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # title_ids Stack(dtype="int64"), # query_seq_lens Stack(dtype="int64"), # title_seq_lens Stack(dtype="int64") # label ): [data for data in fn(samples)] tokenizer = CharTokenizer(vocab, args.language) trans_fn = partial( convert_example, tokenizer=tokenizer, is_test=False, language=args.language) train_loader = create_dataloader( train_ds, trans_fn=trans_fn, batch_size=args.batch_size,
def batchify_fn(data):
    """Collate raw features into padded tensors plus one-hot SPO labels.

    Returns (input_ids, token_type_ids, position_ids, masks, ent_label,
    spo_label). The two label outputs are [one_hot_array, raw_label_list]
    pairs: the one-hot arrays feed the loss, the raw index lists feed the
    metric.
    """
    pad_batch = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),
        'token_type_ids': Pad(
            axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),
        'position_ids': Pad(
            axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),
        'attention_mask': Pad(axis=0, pad_val=0, dtype='float32'),
    }): fn(samples)

    ent_label = [sample['ent_label'] for sample in data]
    spo_label = [sample['spo_label'] for sample in data]
    input_ids, token_type_ids, position_ids, masks = pad_batch(data)
    batch_size, batch_len = input_ids.shape
    num_classes = len(train_ds.label_list)

    # one_hot_ent_label[b, t, 0/1] marks token t as an entity start/end.
    # one_hot_spo_label[b, p, s, o] marks a (subject-start, object-start)
    # pair for predicate p. All indexes are shifted by one because
    # input_ids begin with [CLS].
    one_hot_ent_label = np.zeros([batch_size, batch_len, 2], dtype=np.float32)
    one_hot_spo_label = np.zeros(
        [batch_size, num_classes, batch_len, batch_len], dtype=np.float32)

    for row, spans in enumerate(ent_label):
        for start, end in spans:
            # Shift by 1 for the leading [CLS]; drop spans that fall
            # outside the (possibly truncated) padded batch length.
            start = start + 1
            end = end + 1
            if 0 < start < batch_len and end < batch_len:
                one_hot_ent_label[row, start, 0] = 1
                one_hot_ent_label[row, end, 1] = 1

    for row, triples in enumerate(spo_label):
        for subj, predicate, obj in triples:
            subj_start = subj[0] + 1
            obj_start = obj[0] + 1
            if 0 < subj_start < batch_len and obj_start < batch_len:
                one_hot_spo_label[row, predicate, subj_start, obj_start] = 1

    # Pair the dense labels (loss) with the raw index labels (metric).
    ent_label = [one_hot_ent_label, ent_label]
    spo_label = [one_hot_spo_label, spo_label]
    return input_ids, token_type_ids, position_ids, masks, ent_label, spo_label
# --- Review notes (code collapsed onto single lines; kept byte-identical
# because the exact statement nesting cannot be safely reconstructed) ---
# XLNet GLUE fine-tuning driver. Flow visible in the code below:
#  * device / distributed / seed setup; `final_res` is a module-level global
#    written by `evaluate`;
#  * GLUE train (and dev, or dev_matched/dev_mismatched for MNLI) datasets,
#    with a Tuple collator that LEFT-pads (pad_right=False) input ids,
#    token type ids and attention mask — XLNet-style padding;
#  * LinearDecayWithWarmup (step-count or proportion warmup), AdamW with
#    global-norm gradient clipping and weight decay excluding "bias" and
#    "layer_norm" parameters;
#  * train loop: backward/step/lr_scheduler.step/clear_grad, periodic
#    logging, periodic evaluation (both MNLI dev sets when applicable) and
#    rank-0 checkpointing via save_pretrained; `exit(0)` terminates once
#    num_training_steps is reached, after printing `final_res`.
# NOTE(review): `tic_train += time.time() - tic_eval` excludes eval time from
# the speed log — confirm it sits after the eval/save branch.
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) global final_res args.task_name = args.task_name.lower() metric_class = METRIC_CLASSES[args.task_name] model_class, tokenizer_class = XLNetForSequenceClassification, XLNetTokenizer train_ds = load_dataset('glue', args.task_name, splits="train") tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial( convert_example, tokenizer=tokenizer, label_list=train_ds.label_list, max_seq_length=args.max_seq_length) train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False), # token_type Pad(axis=0, pad_val=0, pad_right=False), # attention_mask Stack(dtype="int64" if train_ds.label_list else "float32"), # label ): fn(samples) train_data_loader = DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) if args.task_name == "mnli": dev_ds_matched, dev_ds_mismatched = load_dataset( 'glue', args.task_name, splits=["dev_matched", "dev_mismatched"]) dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) dev_batch_sampler_matched = paddle.io.BatchSampler( dev_ds_matched, batch_size=args.batch_size, shuffle=False) dev_data_loader_matched = DataLoader( dataset=dev_ds_matched, batch_sampler=dev_batch_sampler_matched, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_batch_sampler_mismatched = paddle.io.BatchSampler( dev_ds_mismatched, batch_size=args.batch_size, shuffle=False) dev_data_loader_mismatched = DataLoader( dataset=dev_ds_mismatched, batch_sampler=dev_batch_sampler_mismatched, 
collate_fn=batchify_fn, num_workers=0, return_list=True) else: dev_ds = load_dataset('glue', args.task_name, splits='dev') dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader( dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list) model = XLNetForSequenceClassification.from_pretrained( args.model_name_or_path, num_classes=num_classes) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) if args.max_steps > 0: num_training_steps = args.max_steps num_train_epochs = ceil(num_training_steps / len(train_data_loader)) else: num_training_steps = len(train_data_loader) * args.num_train_epochs num_train_epochs = args.num_train_epochs warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "layer_norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=model.parameters(), grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss( ) if train_ds.label_list else paddle.nn.loss.MSELoss() metric = metric_class() global_step = 0 tic_train = time.time() model.train() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, attention_mask, labels = batch logits = model(input_ids, token_type_ids, attention_mask) loss = loss_fct(logits, labels) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: print( "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" % (global_step, num_training_steps, epoch, step, paddle.distributed.get_rank(), loss, optimizer.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0 or global_step == num_training_steps: tic_eval = time.time() if args.task_name == "mnli": print("matched ", end="") evaluate(model, loss_fct, metric, dev_data_loader_matched) final_res1 = "matched " + final_res print("mismatched ", end="") evaluate(model, loss_fct, metric, dev_data_loader_mismatched) final_res2 = "mismatched " + final_res final_res = final_res1 + "\r\n" + final_res2 print("eval done total : %s s" % (time.time() - tic_eval)) else: evaluate(model, loss_fct, metric, dev_data_loader) print("eval done total : %s s" % (time.time() - tic_eval)) if (not paddle.distributed.get_world_size() > 1 ) or paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "%s_ft_model_%d" % (args.task_name, global_step)) if not 
os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if global_step == num_training_steps: print(final_res) exit(0) tic_train += time.time() - tic_eval
# Inference-only setup for the FewCLUE leaderboard: build the test split,
# the ERNIE binary classifier, and a dataloader yielding
# [src_ids, token_type_ids] batches.
paddle.set_device(args.device)
set_seed(args.seed)

# Task-specific processor reshapes the raw FewCLUE test split.
processor = processor_dict[args.task_name]()
test_ds = load_dataset("fewclue", name=args.task_name, splits=("test"))
test_ds = processor.get_test_datasets(test_ds,
                                      TASK_LABELS_DESC[args.task_name])

tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
    'ernie-1.0', num_classes=2)


def predict_batchify_fn(samples,
                        fn=Tuple(
                            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
                            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
                        )):
    # Materialize the collated fields as a list.
    return list(fn(samples))


predict_trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             max_seq_length=args.max_seq_length,
                             is_test=True)
test_data_loader = create_dataloader(test_ds,
                                     mode='eval',
                                     batch_size=args.batch_size,
                                     batchify_fn=predict_batchify_fn,
                                     trans_fn=predict_trans_func)
# --- Review notes (code collapsed onto one line; kept byte-identical) ---
# Tail of an argparse setup plus the start of a script `__main__` block:
# builds an ERNIE tokenizer, a Tuple collator for (text_input, text_segment),
# and a SemanticIndexBase model, then loads fine-tuned parameters from
# --params_path when the file exists. The trailing `else:` branch is cut off
# at the end of this chunk, so no restyle is attempted.
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") args = parser.parse_args() # yapf: enable if __name__ == "__main__": paddle.set_device(args.device) tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0') trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # text_input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # text_segment ): [data for data in fn(samples)] pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained( "ernie-1.0") model = SemanticIndexBase(pretrained_model, output_emb_size=args.output_emb_size) # load pretrained semantic model if args.params_path and os.path.isfile(args.params_path): state_dict = paddle.load(args.params_path) model.set_dict(state_dict) logger.info("Loaded parameters from %s" % args.params_path) else:
# --- Review notes (code collapsed onto single lines; kept byte-identical
# because the AMP scaling / optimizer-step ordering is too order-sensitive to
# reconstruct safely from collapsed text) ---
# In-batch-negatives semantic indexing trainer. Flow visible below:
#  * device / distributed / seed setup; text-pair dataset read via
#    read_text_pair from --train_set_file;
#  * ERNIE 1.0 encoder + tokenizer (bert/roberta alternatives are left as
#    commented examples); Tuple collator producing query/title id and
#    segment tensors;
#  * SemanticIndexBatchNeg with margin/scale, optional warm start from
#    --init_from_ckpt, wrapped in DataParallel;
#  * AdamW with LinearDecayWithWarmup, weight decay excluding "bias"/"norm"
#    parameters; optional AMP via GradScaler (scaler.scale -> backward ->
#    scaler.minimize) with a custom white list; rank-0 logging every 10 steps
#    and rank-0 checkpointing (model_state.pdparams + tokenizer) every
#    --save_steps steps.
def do_train(): paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args.seed) train_ds = load_dataset(read_text_pair, data_path=args.train_set_file, lazy=False) # If you wanna use bert/roberta pretrained model, # pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese') # pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext') pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained( 'ernie-1.0') # If you wanna use bert/roberta pretrained model, # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0') trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # query_segment Pad(axis=0, pad_val=tokenizer.pad_token_id), # title_input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # tilte_segment ): [data for data in fn(samples)] train_data_loader = create_dataloader(train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) model = SemanticIndexBatchNeg(pretrained_model, margin=args.margin, scale=args.scale, output_emb_size=args.output_emb_size) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) print("warmup from:{}".format(args.init_from_ckpt)) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. 
# All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.amp_loss_scale) global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch with paddle.amp.auto_cast( args.use_amp, custom_white_list=["layer_norm", "softmax", "gelu"]): loss = model(query_input_ids=query_input_ids, title_input_ids=title_input_ids, query_token_type_ids=query_token_type_ids, title_token_type_ids=title_token_type_ids) if args.use_amp: scaled = scaler.scale(loss) scaled.backward() scaler.minimize(optimizer, scaled) else: loss.backward() optimizer.step() global_step += 1 if global_step % 10 == 0 and rank == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, 10 / (time.time() - tic_train))) tic_train = time.time() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 and rank == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) save_param_path = os.path.join(save_dir, 'model_state.pdparams') paddle.save(model.state_dict(), save_param_path) tokenizer.save_pretrained(save_dir)
def evaluate():
    """Beam-search infilling over the poetry dev set; log ROUGE-1/ROUGE-2."""
    paddle.set_device(args.device)
    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)

    # Pick the tokenizer family from the checkpoint name. Order matters:
    # "ernie-tiny" must be tested before the generic "ernie".
    ckpt = args.model_name_or_path
    if "ernie-tiny" in ckpt:
        tokenizer = ErnieTinyTokenizer.from_pretrained(ckpt)
    elif "ernie" in ckpt:
        tokenizer = ErnieTokenizer.from_pretrained(ckpt)
    elif "roberta" in ckpt or "rbt" in ckpt:
        tokenizer = RobertaTokenizer.from_pretrained(ckpt)
    elif "electra" in ckpt:
        tokenizer = ElectraTokenizer.from_pretrained(ckpt)
    else:
        tokenizer = BertTokenizer.from_pretrained(ckpt)

    dev_dataset = load_dataset('poetry', splits=('dev'), lazy=False)

    # Fall back to [MASK] when the vocab has no dedicated [ATTN] token.
    if '[ATTN]' in tokenizer.vocab:
        attn_id = tokenizer.vocab['[ATTN]']
    else:
        attn_id = tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len)
    # Eight padded fields, post-processed by after_padding.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))

    dev_dataset = dev_dataset.map(trans_func)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_dataset, batch_size=args.batch_size, shuffle=False)
    data_loader = DataLoader(
        dataset=dev_dataset,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    if args.init_checkpoint:
        model.set_state_dict(paddle.load(args.init_checkpoint))
    model.eval()

    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)

    evaluated_sentences_ids = []
    reference_sentences_ids = []
    logger.info("Evaluating...")
    for data in tqdm(data_loader):
        # Only encoder inputs and raw target labels are used at inference.
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data
        # Use greedy_search_infilling or beam_search_infilling to get
        # predictions.
        output_ids = beam_search_infilling(
            model, src_ids, src_sids,
            eos_id=eos_id,
            sos_id=sos_id,
            attn_id=attn_id,
            pad_id=pad_id,
            unk_id=unk_id,
            vocab_size=vocab_size,
            max_decode_len=args.max_decode_len,
            max_encode_len=args.max_encode_len,
            beam_width=args.beam_width,
            length_penalty=args.length_penalty,
            tgt_type_id=tgt_type_id)
        for pred in output_ids.tolist():
            # Truncate predictions at EOS when present.
            if eos_id in pred:
                pred = pred[:pred.index(eos_id)]
            evaluated_sentences_ids.append(pred)
        # References are assumed to always contain EOS.
        for ref in raw_tgt_labels.numpy().tolist():
            reference_sentences_ids.append(ref[:ref.index(eos_id)])

    score1 = rouge1.score(evaluated_sentences_ids, reference_sentences_ids)
    score2 = rouge2.score(evaluated_sentences_ids, reference_sentences_ids)
    logger.info("Rouge-1: %.5f ,Rouge-2: %.5f" % (score1 * 100, score2 * 100))
parser.add_argument("--label_path", type=str, default=None, help="The path of label dict.")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size per GPU/CPU for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
args = parser.parse_args()
# yapf: enable
# FIX: directive was misspelled "# yapf: enbale", which left yapf formatting
# disabled for the rest of the file.

# Load the test data and the label vocabulary.
label2id, id2label = load_dict(args.label_path)
test_ds = load_dataset(read, data_path=args.test_path, lazy=False)

tokenizer = PPMiniLMTokenizer.from_pretrained(args.base_model_name)
trans_func = partial(convert_example_to_feature,
                     tokenizer=tokenizer,
                     label2id=label2id,
                     max_seq_len=args.max_seq_len)
test_ds = test_ds.map(trans_func, lazy=False)

# Collate: padded input_ids / token_type_ids plus two stacked int64 fields.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
    Stack(dtype="int64"),
    Stack(dtype="int64")
): fn(samples)

test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                            batch_size=args.batch_size,
                                            shuffle=False)
test_loader = paddle.io.DataLoader(test_ds,
                                   batch_sampler=test_batch_sampler,
                                   collate_fn=batchify_fn)

# Restore the fine-tuned weights into the PP-MiniLM classifier.
loaded_state_dict = paddle.load(args.model_path)
model = PPMiniLMForSequenceClassification.from_pretrained(
    args.base_model_name, num_classes=len(label2id))
model.load_dict(loaded_state_dict)

metric = AccuracyAndF1()
# --- Review notes (code collapsed onto single lines; kept byte-identical
# because the distributed train-loop nesting cannot be safely reconstructed
# from collapsed text) ---
# DuReader-yesno classifier trainer. Flow visible below:
#  * device / distributed / seed setup; train/dev/test splits of
#    'dureader_yesno';
#  * two Dict collators: train/dev include 'labels' (Stack int64); the test
#    collator stacks the raw 'id' field instead — and the test loader
#    correctly uses test_batchify_fn;
#  * model sized by len(train_ds.label_list), wrapped in DataParallel when
#    world_size > 1;
#  * AdamW + LinearDecayWithWarmup, weight decay excluding "bias"/"norm";
#    num_train_epochs derived with math.ceil when --max_steps caps training;
#  * train loop: log every --logging_steps, backward/step/lr.step/clear_grad,
#    rank-0 dev evaluation and save_pretrained checkpointing every
#    --save_steps, break at num_training_steps, then a rank-0 test-set
#    predict written to prediction.json.
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) set_seed(args) train_ds, dev_ds, test_ds = load_dataset('dureader_yesno', splits=['train', 'dev', 'test']) trans_func = partial(convert_example, tokenizer=tokenizer) train_batchify_fn = lambda samples, fn=Dict( { 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), 'labels': Stack(dtype="int64") }): fn(samples) test_batchify_fn = lambda samples, fn=Dict( { 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), 'id': Stack() }): fn(samples) train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=train_batchify_fn, return_list=True) test_ds = test_ds.map(trans_func, lazy=True) test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=args.batch_size, shuffle=False) test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=test_batchify_fn, return_list=True) model = model_class.from_pretrained(args.model_name_or_path, num_classes=len(train_ds.label_list)) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( 
train_data_loader) * args.num_train_epochs num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() global_step = 0 tic_train = time.time() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, label = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = criterion(logits, label) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch + 1, step + 1, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if rank == 0: evaluate(model, metric, dev_data_loader) output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) if global_step == num_training_steps: break if rank == 0: predictions = predict(model, test_data_loader) with open('prediction.json', "w") as writer: writer.write( 
json.dumps(predictions, ensure_ascii=False, indent=4) + "\n")
def predict():
    """Generate poems for the dev set with beam search and print each
    source/target/prediction triple."""
    paddle.set_device(args.device)
    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)

    # Choose the tokenizer family from the checkpoint name; "ernie-tiny"
    # must be matched before the generic "ernie".
    ckpt = args.model_name_or_path
    if "ernie-tiny" in ckpt:
        tokenizer = ErnieTinyTokenizer.from_pretrained(ckpt)
    elif "ernie" in ckpt:
        tokenizer = ErnieTokenizer.from_pretrained(ckpt)
    elif "roberta" in ckpt or "rbt" in ckpt:
        tokenizer = RobertaTokenizer.from_pretrained(ckpt)
    elif "electra" in ckpt:
        tokenizer = ElectraTokenizer.from_pretrained(ckpt)
    else:
        tokenizer = BertTokenizer.from_pretrained(ckpt)

    dev_dataset = load_dataset('poetry', splits=('dev'), lazy=False)

    # Fall back to [MASK] when the vocab has no dedicated [ATTN] token.
    if '[ATTN]' in tokenizer.vocab:
        attn_id = tokenizer.vocab['[ATTN]']
    else:
        attn_id = tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len)
    # Eight padded fields, post-processed by after_padding.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))

    dev_dataset = dev_dataset.map(trans_func)
    test_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    data_loader = DataLoader(dataset=dev_dataset,
                             batch_sampler=test_batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=0,
                             return_list=True)

    if args.init_checkpoint:
        model.set_state_dict(paddle.load(args.init_checkpoint))
    model.eval()

    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)

    evaluated_sentences = []
    evaluated_sentences_ids = []
    logger.info("Predicting...")
    for data in data_loader:
        # Targets are unpacked only for display, never fed to the model.
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data
        # Use greedy_search_infilling or beam_search_infilling to get
        # predictions.
        output_ids = beam_search_infilling(
            model, src_ids, src_sids,
            eos_id=eos_id,
            sos_id=sos_id,
            attn_id=attn_id,
            pad_id=pad_id,
            unk_id=unk_id,
            vocab_size=vocab_size,
            max_decode_len=args.max_decode_len,
            max_encode_len=args.max_encode_len,
            beam_width=args.beam_width,
            length_penalty=args.length_penalty,
            tgt_type_id=tgt_type_id)
        for source_ids, target_ids, predict_ids in zip(
                src_ids.numpy().tolist(),
                raw_tgt_labels.numpy().tolist(),
                output_ids.tolist()):
            if eos_id in predict_ids:
                predict_ids = predict_ids[:predict_ids.index(eos_id)]
            # Strip the leading [CLS] and everything from [SEP] onward,
            # then detokenize.
            source_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(source_ids[1:source_ids.index(eos_id)])))
            tgt_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(target_ids[1:target_ids.index(eos_id)])))
            predict_ids = ''.join(
                map(post_process, vocab.to_tokens(predict_ids)))
            print("source :%s\ntarget :%s\npredict:%s\n" %
                  (source_sentence, tgt_sentence, predict_ids))
def do_train(args):
    """Fine-tune a 3-way DuReader-yesno classifier (legacy dataset API).

    Trains with AdamW + linear warmup decay, periodically checkpointing on
    rank 0, evaluating on dev, and finally evaluating on test.

    FIX: the test DataLoader previously used `collate_fn=batchify_fn` — the
    4-field train collator that expects a label — while the 3-field
    `test_batchify_fn` defined for it was never used; the test loader now
    uses `test_batchify_fn`.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    set_seed(args)

    train_ds, dev_ds, test_ds = ppnlp.datasets.DuReaderYesNo.get_datasets(
        ['train', 'dev', 'test'])
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.get_labels(),
                         max_seq_length=args.max_seq_length)

    train_ds = train_ds.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # The seq-length field (index 2) is dropped after padding.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # length
        Stack(dtype="int64"),  # start_pos
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   return_list=True)

    dev_ds = dev_ds.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 return_list=True)

    # Test split has no labels (is_test=True), hence the 3-field collator.
    test_trans_func = partial(convert_example,
                              tokenizer=tokenizer,
                              label_list=train_ds.get_labels(),
                              max_seq_length=args.max_seq_length,
                              is_test=True)
    test_ds = test_ds.apply(test_trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack()  # length
    ): fn(samples)
    # FIX: was collate_fn=batchify_fn (4-field train collator), which cannot
    # collate the 3-field test features.
    test_data_loader = DataLoader(dataset=test_ds,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=test_batchify_fn,
                                  return_list=True)

    model = model_class.from_pretrained(args.model_name_or_path, num_classes=3)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_ds.examples) // args.batch_size * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)
    # Weight decay is skipped for bias and LayerNorm parameters.
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, label = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, label)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # NOTE(review): clear_gradients() is the pre-2.0 dygraph API name
            # (elsewhere this file uses clear_grad()) — kept as-is in case
            # this script targets the older Paddle release; verify.
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    print('Saving checkpoint to:', output_dir)
        # NOTE(review): nesting reconstructed from collapsed source — dev is
        # evaluated once per epoch, test once after training; confirm against
        # the original script.
        if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
            evaluate(model, metric, dev_data_loader)
    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
        evaluate(model, metric, test_data_loader, True)
# --- Review notes (code collapsed onto one line; kept byte-identical) ---
# Start of a script `__main__` block for express-address NER: builds
# ExpressDataset train/dev/test splits, an ERNIE tokenizer, and a collator
# that maps convert_example over raw samples before padding (labels padded
# with ignore_label = -1). The final `test_loader = paddle.io.DataLoader(...)`
# call is cut off mid-arguments at the end of this chunk, so no restyle is
# attempted.
if __name__ == '__main__': paddle.set_device('gpu') train_ds = ExpressDataset('./data/train.txt') dev_ds = ExpressDataset('./data/dev.txt') test_ds = ExpressDataset('./data/test.txt') tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') trans_func = partial(convert_example, tokenizer=tokenizer, label_vocab=train_ds.label_vocab) ignore_label = -1 batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), Stack(), Pad(axis=0, pad_val=ignore_label)): fn(list(map(trans_func, samples))) train_loader = paddle.io.DataLoader(dataset=train_ds, batch_size=200, shuffle=True, return_list=True, collate_fn=batchify_fn) dev_loader = paddle.io.DataLoader(dataset=dev_ds, batch_size=200, return_list=True, collate_fn=batchify_fn) test_loader = paddle.io.DataLoader(dataset=test_ds, batch_size=200, return_list=True,
def do_train(args):
    """Fine-tune a pretrained model on a GLUE-style task.

    Builds train/dev data loaders for ``args.task_name`` (MNLI gets the
    matched/mismatched dev pair), wraps the model with ``DataParallel`` when
    a multi-GPU world is initialized, trains with AdamW under a linear
    warmup/decay schedule, and periodically evaluates and checkpoints.

    Args:
        args: parsed command-line namespace. Fields read here: ``n_gpu``,
            ``task_name``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``warmup_steps``, ``warmup_proportion``,
            ``learning_rate``, ``adam_epsilon``, ``weight_decay``,
            ``logging_steps``, ``save_steps``, ``output_dir``.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset = dataset_class.get_datasets(["train"])
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_dataset.get_labels(),
        max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    # Collate: pad ids/segments, stack length and label, then drop the
    # length field (index 2), which the forward pass below does not consume.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64" if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    if args.task_name == "mnli":
        # MNLI evaluates on two dev splits; build a loader for each.
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(
            trans_func, lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_dataset, batch_size=args.batch_size, shuffle=False)
        dev_data_loader = DataLoader(
            dataset=dev_dataset,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)

    # No label list means a regression task: single output, MSE loss below.
    # (Fix: compare to None with `is`, not `==` -- PEP 8 E711.)
    num_classes = 1 if train_dataset.get_labels() is None else len(
        train_dataset.get_labels())
    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (
        int(math.floor(num_training_steps * args.warmup_proportion)))

    def lr_lambda(current_step,
                  num_warmup_steps=warmup_steps,
                  num_training_steps=num_training_steps):
        # Linear warmup to the base LR, then linear decay to zero.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(0.0,
                   float(num_training_steps - current_step) /
                   float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   lr_lambda)

    # Parameters whose names contain "bias" or "norm" skip weight decay.
    # Hoisted out of apply_decay_param_fun: the original rebuilt this list on
    # every parameter lookup at every step (quadratic in parameter count).
    decay_param_names = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_param_names)

    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
    ) else paddle.nn.loss.MSELoss()
    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                # Hoisted: both branches printed the identical message.
                print("eval done total : %s s" % (time.time() - tic_eval))

                # Only rank 0 writes checkpoints in distributed runs.
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    # exist_ok avoids the exists()/makedirs() race.
                    os.makedirs(output_dir, exist_ok=True)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
decoder_input_ids = [labels[-1]] + labels[:-1] return src_ids, src_attention_mask_ids, decoder_input_ids, decoder_input_attention_mask_ids, labels else: return src_ids, src_attention_mask_ids return warpper trunc = convert_example() train_dataset = train_dataset.map(trunc) dev_dataset = dev_dataset.map(trunc) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=t.pad_token_id), # src_ids Pad(axis=0, pad_val=0), # src_pids Pad(axis=0, pad_val=t.pad_token_id), # tgt_ids Pad(axis=0, pad_val=0), # tgt_pids Pad(axis=0, pad_val=t.pad_token_id) # label ): fn(samples) batch_size = args.batch_size train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=batchify_fn, use_shared_memory=False, num_workers=args.num_workers)