def train(args):
    if dist.get_rank() == 0:
        shutil.rmtree('log', ignore_errors=True)
        # Logging writer
        writer = LogWriter(logdir='log')
    # Enable multi-GPU training
    if len(args.gpus.split(',')) > 1:
        dist.init_parallel_env()
    # Build the training data
    train_dataset = PPASRDataset(args.train_manifest, args.dataset_vocab,
                                 mean_std_filepath=args.mean_std_path,
                                 min_duration=args.min_duration,
                                 max_duration=args.max_duration)
    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, collate_fn=collate_fn,
                              batch_sampler=batch_sampler, num_workers=args.num_workers)
    # Build the test data
    test_dataset = PPASRDataset(args.test_manifest, args.dataset_vocab, mean_std_filepath=args.mean_std_path)
    batch_sampler = paddle.io.BatchSampler(test_dataset, batch_size=args.batch_size)
    test_loader = DataLoader(dataset=test_dataset, collate_fn=collate_fn,
                             batch_sampler=batch_sampler, num_workers=args.num_workers)
    # Build the model
    model = DeepSpeech2Model(feat_size=train_dataset.feature_dim,
                             dict_size=len(train_dataset.vocabulary),
                             num_conv_layers=args.num_conv_layers,
                             num_rnn_layers=args.num_rnn_layers,
                             rnn_size=args.rnn_layer_size)
    if dist.get_rank() == 0:
        print('The third dimension of input_size is variable-length; a fixed value is used here only to inspect the output shapes!')
        paddle.summary(model, input_size=[(None, train_dataset.feature_dim, 970), (None,)],
                       dtypes=[paddle.float32, paddle.int64])
    # Wrap the model for multi-GPU training
    if len(args.gpus.split(',')) > 1:
        model = paddle.DataParallel(model)
    # Set up the optimizer
    clip = paddle.nn.ClipGradByNorm(clip_norm=3.0)
    # Get the epoch number of the checkpoint being resumed
    last_epoch = int(re.findall(r'\d+', args.resume)[-1]) if args.resume is not None else 0
    scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=args.learning_rate, gamma=0.83,
                                                     last_epoch=last_epoch, verbose=True)
    optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=scheduler,
                                      weight_decay=paddle.regularizer.L2Decay(1e-06), grad_clip=clip)
    # Loss function
    ctc_loss = paddle.nn.CTCLoss()
    # Load a pretrained model
    if args.pretrained_model is not None:
        model_dict = model.state_dict()
        model_state_dict = paddle.load(os.path.join(args.pretrained_model, 'model.pdparams'))
        # Feature layers
        for name, weight in model_dict.items():
            if name in model_state_dict.keys():
                if weight.shape != list(model_state_dict[name].shape):
                    print('{} not used, shape {} unmatched with {} in model.'.format(
                        name, list(model_state_dict[name].shape), weight.shape))
                    model_state_dict.pop(name, None)
            else:
                print('Lack weight: {}'.format(name))
        model.set_dict(model_state_dict)
        print('Pretrained model loaded successfully')
    # Resume training from a checkpoint
    if args.resume is not None:
        model.set_state_dict(paddle.load(os.path.join(args.resume, 'model.pdparams')))
        optimizer.set_state_dict(paddle.load(os.path.join(args.resume, 'optimizer.pdopt')))
        print('Model parameters and optimizer state restored successfully')
    train_step = 0
    test_step = 0
    # Start training
    for epoch in range(last_epoch, args.num_epoch):
        for batch_id, (inputs, labels, input_lens, label_lens) in enumerate(train_loader()):
            out, out_lens = model(inputs, input_lens)
            out = paddle.transpose(out, perm=[1, 0, 2])
            # Compute the loss
            loss = ctc_loss(out, labels, out_lens, label_lens)
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # Only one process prints during multi-GPU training
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                print('[%s] Train epoch %d, batch %d, loss: %f' % (datetime.now(), epoch, batch_id, loss))
                writer.add_scalar('Train loss', loss, train_step)
                train_step += 1
            # Also save the model every fixed number of steps
            if batch_id % 2000 == 0 and batch_id != 0 and dist.get_rank() == 0:
                save_model(args=args, epoch=epoch, model=model, optimizer=optimizer)
        # Only one process runs evaluation and saves the model during multi-GPU training
        if dist.get_rank() == 0:
            # Evaluate
            model.eval()
            c = evaluate(model, test_loader, test_dataset.vocabulary)
            print('\n', '=' * 70)
            print('[%s] Test epoch %d, cer: %f' % (datetime.now(), epoch, c))
            print('=' * 70)
            writer.add_scalar('Test cer', c, test_step)
            test_step += 1
            model.train()
            # Log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            # Save the model
            save_model(args=args, epoch=epoch, model=model, optimizer=optimizer)
        scheduler.step()
def create_data_loader(args, places=None):
    if args.train_file is not None and args.dev_file is not None:
        datasets = load_dataset('wmt14ende', data_files=[args.train_file, args.dev_file],
                                splits=('train', 'dev'))
    elif args.train_file is None and args.dev_file is None:
        datasets = load_dataset('wmt14ende', splits=('train', 'dev'))
    else:
        raise ValueError("--train_file and --dev_file must be both or neither set.")

    if args.vocab_file is not None:
        src_vocab = Vocab.load_vocabulary(filepath=args.vocab_file, unk_token=args.unk_token,
                                          bos_token=args.bos_token, eos_token=args.eos_token)
    elif not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    data_loaders = [None] * 2
    for i, dataset in enumerate(datasets):
        dataset = dataset.map(convert_samples, lazy=False).filter(
            partial(min_max_filer, max_len=args.max_length))
        batch_sampler = TransformerBatchSampler(
            dataset=dataset, batch_size=args.batch_size, pool_size=args.pool_size,
            sort_type=args.sort_type, shuffle=args.shuffle, shuffle_batch=args.shuffle_batch,
            use_token_batch=True, max_length=args.max_length,
            distribute_mode=True if i == 0 else False,
            world_size=dist.get_world_size(), rank=dist.get_rank(),
            pad_seq=args.pad_seq, bsz_multi=args.bsz_multi)
        data_loader = DataLoader(
            dataset=dataset, places=places, batch_sampler=batch_sampler,
            collate_fn=partial(prepare_train_input, bos_idx=args.bos_idx, eos_idx=args.eos_idx,
                               pad_idx=args.bos_idx, pad_seq=args.pad_seq,
                               dtype=args.input_dtype),
            num_workers=args.num_workers)
        data_loaders[i] = data_loader
    return data_loaders
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() # Create dataset, tokenizer and dataloader. train_ds, test_ds = load_dataset('msra_ner', splits=('train', 'test'), lazy=False) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) label_list = train_ds.label_list label_num = len(label_list) no_entity_id = label_num - 1 trans_func = partial(tokenize_and_align_labels, tokenizer=tokenizer, no_entity_id=no_entity_id, max_seq_len=args.max_seq_length) train_ds = train_ds.map(trans_func) ignore_label = -100 batchify_fn = lambda samples, fn=Dict( { 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id ), # segment 'seq_len': Stack(), # seq_len 'labels': Pad(axis=0, pad_val=ignore_label) # label }): fn(samples) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_data_loader = DataLoader(dataset=train_ds, collate_fn=batchify_fn, num_workers=0, batch_sampler=train_batch_sampler, return_list=True) test_ds = test_ds.map(trans_func) test_data_loader = DataLoader(dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = BertForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) global_step = 0 last_step = args.num_train_epochs * len(train_data_loader) tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, _, labels = batch logits = model(input_ids, token_type_ids) loss = loss_fct(logits, labels) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == last_step: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: evaluate(model, loss_fct, metric, test_data_loader, label_num) paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step))
def do_train(args): paddle.enable_static() if not args.eager_run else None paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) args.task_name = args.task_name.lower() dataset_class, metric_class = TASK_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] train_dataset, dev_dataset = dataset_class.get_datasets(["train", "dev"]) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_dataset.get_labels(), max_seq_length=args.max_seq_length) train_dataset = train_dataset.apply(trans_func, lazy=True) # train_batch_sampler = SamplerHelper(train_dataset).shuffle().batch( # batch_size=args.batch_size).shard() train_batch_sampler = paddle.io.DistributedBatchSampler( # train_dataset, batch_size=args.batch_size, shuffle=True) train_dataset, batch_size=args.batch_size, shuffle=False) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack(), # length Stack(dtype="int64" if train_dataset.get_labels() else "float32") # label ): [data for i, data in enumerate(fn(samples)) if i != 2] train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.apply(trans_func, lazy=True) # dev_batch_sampler = SamplerHelper(dev_dataset).batch( # batch_size=args.batch_size) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) # model = model_class.from_pretrained( # args.model_name_or_path,) num_classes=len(train_dataset.get_labels())) model = BertForPreTraining( BertModel(**model_class.pretrained_init_configuration[ args.model_name_or_path])) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps if args.max_steps > 0 else (len(train_data_loader) * args.num_train_epochs): float( current_step) / float(max(1, num_warmup_steps)) if current_step < num_warmup_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - num_warmup_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels( ) else paddle.nn.loss.MSELoss() metric = metric_class() ### TODO: use hapi # trainer = paddle.hapi.Model(model) # trainer.prepare(optimizer, loss_fct, paddle.metric.Accuracy()) # trainer.fit(train_data_loader, # dev_data_loader, # log_freq=args.logging_steps, # epochs=args.num_train_epochs, # save_dir=args.output_dir) model.eval() param_names = list(model.state_dict().keys()) import pickle with open(args.params_pd_path, "rb") as f: np_params = pickle.load(f) model.set_state_dict(dict(zip(param_names, np_params))) paddle.save(model.state_dict(), 
"%s.pdparams" % args.model_name_or_path) for data in train_data_loader(): print(model(*data[:-1])) exit(0) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): input_ids, segment_ids, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct(logits, labels) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0: evaluate(model, loss_fct, metric, dev_data_loader) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) global_step += 1
def create_data_loader(args): root = None if args.root == "None" else args.root (src_vocab, trg_vocab) = WMT14ende.get_vocab(root=root) args.src_vocab_size, args.trg_vocab_size = len(src_vocab), len(trg_vocab) transform_func = WMT14ende.get_default_transform_func(root=root) datasets = [ WMT14ende.get_datasets(mode=m, transform_func=transform_func) for m in ["train", "dev"] ] if args.shuffle or args.shuffle_batch: if args.shuffle_seed == "None" or args.shuffle_seed is None: shuffle_seed = 0 else: shuffle_seed = args.shuffle_seed def _max_token_fn(current_idx, current_batch_size, tokens_sofar, data_source): return max(tokens_sofar, len(data_source[current_idx][0]) + 1, len(data_source[current_idx][1]) + 1) def _key(size_so_far, minibatch_len): return size_so_far * minibatch_len data_loaders = [(None)] * 2 for i, dataset in enumerate(datasets): m = dataset.mode dataset = dataset.filter( partial(min_max_filer, max_len=args.max_length)) sampler = SamplerHelper(dataset) src_key = (lambda x, data_source: len(data_source[x][0]) + 1) if args.sort_type == SortType.GLOBAL: buffer_size = -1 trg_key = (lambda x, data_source: len(data_source[x][1]) + 1) # Sort twice sampler = sampler.sort(key=trg_key, buffer_size=buffer_size).sort( key=src_key, buffer_size=buffer_size) else: if args.shuffle: sampler = sampler.shuffle(seed=shuffle_seed) if args.sort_type == SortType.POOL: buffer_size = args.pool_size sampler = sampler.sort(key=src_key, buffer_size=buffer_size) batch_sampler = sampler.batch(batch_size=args.batch_size, drop_last=False, batch_size_fn=_max_token_fn, key=_key) if m == "train": batch_sampler = batch_sampler.shard() if args.shuffle_batch: batch_sampler.shuffle(seed=shuffle_seed) data_loader = DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=partial(prepare_train_input, bos_idx=args.bos_idx, eos_idx=args.eos_idx, pad_idx=args.bos_idx), num_workers=0, return_list=True) data_loaders[i] = (data_loader) return data_loaders
def main(): paddle.enable_static() if FLAGS.dynamic else None if not FLAGS.eval_only: # training mode train_transform = Compose([ ColorDistort(), RandomExpand(), RandomCrop(), RandomFlip(), NormalizeBox(), PadBox(), BboxXYXY2XYWH() ]) train_collate_fn = BatchCompose([RandomShape(), NormalizeImage()]) dataset = COCODataset(dataset_dir=FLAGS.data, anno_path='annotations/instances_train2017.json', image_dir='train2017', with_background=False, mixup=True, transform=train_transform) batch_sampler = DistributedBatchSampler(dataset, batch_size=FLAGS.batch_size, shuffle=True, drop_last=True) loader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=FLAGS.num_workers, return_list=True, collate_fn=train_collate_fn) else: # evaluation mode eval_transform = Compose([ ResizeImage(target_size=608), NormalizeBox(), PadBox(), BboxXYXY2XYWH() ]) eval_collate_fn = BatchCompose([NormalizeImage()]) dataset = COCODataset(dataset_dir=FLAGS.data, anno_path='annotations/instances_val2017.json', image_dir='val2017', with_background=False, transform=eval_transform) # batch_size can only be 1 in evaluation for YOLOv3 # prediction bbox is a LoDTensor batch_sampler = DistributedBatchSampler(dataset, batch_size=1, shuffle=False, drop_last=False) loader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=FLAGS.num_workers, return_list=True, collate_fn=eval_collate_fn) pretrained = FLAGS.eval_only and FLAGS.weights is None model = yolov3_darknet53(num_classes=dataset.num_classes, num_max_boxes=NUM_MAX_BOXES, model_mode='eval' if FLAGS.eval_only else 'train', pretrained=pretrained) if FLAGS.pretrain_weights and not FLAGS.eval_only: model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True) optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=YoloLoss(num_classes=dataset.num_classes)) # NOTE: we implement COCO metric of YOLOv3 model here, separately # from 'prepare' and 'fit' framework for follwing reason: # 1. YOLOv3 network structure is different between 'train' and # 'eval' mode, in 'eval' mode, output prediction bbox is not the # feature map used for YoloLoss calculating # 2. COCO metric behavior is also different from defined Metric # for COCO metric should not perform accumulate in each iteration # but only accumulate at the end of an epoch if FLAGS.eval_only: if FLAGS.weights is not None: model.load(FLAGS.weights, reset_optimizer=True) preds = model.predict(loader, stack_outputs=False) _, _, _, img_ids, bboxes = preds anno_path = os.path.join(FLAGS.data, 'annotations/instances_val2017.json') coco_metric = COCOMetric(anno_path=anno_path, with_background=False) for img_id, bbox in zip(img_ids, bboxes): coco_metric.update(img_id, bbox) coco_metric.accumulate() coco_metric.reset() return if FLAGS.resume is not None: model.load(FLAGS.resume) save_dir = FLAGS.save_dir or 'yolo_checkpoint' model.fit(train_data=loader, epochs=FLAGS.epoch - FLAGS.no_mixup_epoch, save_dir=os.path.join(save_dir, "mixup"), save_freq=10) # do not use image mixup transfrom in the last FLAGS.no_mixup_epoch epoches dataset.mixup = False model.fit(train_data=loader, epochs=FLAGS.no_mixup_epoch, save_dir=os.path.join(save_dir, "no_mixup"), save_freq=5)
def __getitem__(self, idx):
    data = self.segment[idx]
    with data.open() as fp:
        image_tensor = self.transform(Image.open(fp))
    return image_tensor, self.category_to_index[data.label.classification.category]


"""Build a dataloader and run it."""
# Please visit `https://gas.graviti.cn/tensorbay/developer` to get the AccessKey.
ACCESS_KEY = "<YOUR_ACCESSKEY>"

to_tensor = transforms.ToTensor()
normalization = transforms.Normalize(mean=[0.485], std=[0.229])
my_transforms = transforms.Compose([to_tensor, normalization])

train_segment = DogsVsCatsSegment(GAS(ACCESS_KEY), segment_name="train",
                                  transform=my_transforms)
train_dataloader = DataLoader(train_segment, batch_size=4, shuffle=True, num_workers=0)

for index, (image, label) in enumerate(train_dataloader):
    print(f"{index}: {label}")
def predict2file(args): if args.task_name == "mnli": test_ds_matched, test_ds_mismatched = load_dataset( "glue", "mnli", splits=["test_matched", "test_mismatched"]) id2label = dict( zip(range(len(test_ds_matched.label_list)), test_ds_matched.label_list)) else: test_ds = load_dataset("glue", args.task_name, splits="test") if test_ds.label_list is not None: id2label = dict( zip(range(len(test_ds.label_list)), test_ds.label_list)) else: id2label = None model = MPNetForSequenceClassification.from_pretrained(args.ckpt_path) model.eval() tokenizer = MPNetTokenizer.from_pretrained(args.ckpt_path) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), Pad(axis=0, pad_val=tokenizer.pad_token_type_id), ): fn(samples) trans_func = partial( convert_example, tokenizer=tokenizer, label_list=None, max_seq_length=args.max_seq_length, is_test=True, ) if args.task_name == "mnli": test_ds_matched = test_ds_matched.map(trans_func, lazy=True) test_ds_mismatched = test_ds_mismatched.map(trans_func, lazy=True) test_batch_sampler_matched = paddle.io.BatchSampler( test_ds_matched, batch_size=args.batch_size, shuffle=False) test_data_loader_matched = DataLoader( dataset=test_ds_matched, batch_sampler=test_batch_sampler_matched, collate_fn=batchify_fn, num_workers=2, return_list=True, ) test_batch_sampler_mismatched = paddle.io.BatchSampler( test_ds_mismatched, batch_size=args.batch_size, shuffle=False) test_data_loader_mismatched = DataLoader( dataset=test_ds_mismatched, batch_sampler=test_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=2, return_list=True, ) file_m = os.path.join("template", task2filename[args.task_name][0]) file_mm = os.path.join("template", task2filename[args.task_name][1]) matched_outputs = predict(test_data_loader_matched, model, id2label) mismatched_outputs = predict(test_data_loader_mismatched, model, id2label) writetsv(matched_outputs, file_m) writetsv(mismatched_outputs, file_mm) else: test_ds = test_ds.map(trans_func, lazy=True) test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=args.batch_size, shuffle=False) test_data_loader = DataLoader( dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn, num_workers=2, return_list=True, ) predict_outputs = predict(test_data_loader, model, id2label) file = os.path.join("template", task2filename[args.task_name]) writetsv(predict_outputs, file)
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) args.task_name = args.task_name.lower() metric_class = METRIC_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] train_ds, dev_ds = load_dataset('clue', args.task_name, splits=('train', 'dev')) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, label_list=train_ds.label_list, tokenizer=tokenizer, max_seq_length=args.max_seq_length) train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype="int64" if train_ds.label_list else "float32") # label ): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_classes = 1 if train_ds.label_list == None else len( train_ds.label_list) model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_classes) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) if args.max_steps > 0: num_training_steps = args.max_steps num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) else: num_training_steps = len(train_data_loader) * args.num_train_epochs num_train_epochs = args.num_train_epochs warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)) loss_fct = paddle.nn.loss.CrossEntropyLoss( ) if train_ds.label_list else paddle.nn.loss.MSELoss() metric = metric_class() best_acc = 0.0 global_step = 0 tic_train = time.time() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct(logits, labels) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: print( "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" % (global_step, num_training_steps, epoch, step, paddle.distributed.get_rank(), loss, optimizer.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0 or global_step == num_training_steps: tic_eval = time.time() acc = evaluate(model, loss_fct, metric, dev_data_loader) print("eval done total : %s s" % (time.time() - tic_eval)) if acc > best_acc: best_acc = acc output_dir = args.output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if global_step >= num_training_steps: print("best_acc: ", best_acc) return print("best_acc: ", best_acc)
def prepare_dataloader(dataset):
    return DataLoader(dataset, places=places, num_workers=num_workers,
                      batch_size=1, drop_last=True)
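# A minimal usage sketch for the helper above (an assumption, not part of the original
# snippet): it presumes the module-level `places` and `num_workers` globals that
# prepare_dataloader() reads, and uses a small hypothetical dataset for illustration.
import numpy as np
import paddle
from paddle.io import Dataset, DataLoader

places = [paddle.CPUPlace()]   # hypothetical global expected by prepare_dataloader
num_workers = 0                # hypothetical global expected by prepare_dataloader

class _ToyDataset(Dataset):
    # ten random feature vectors, purely for demonstration
    def __getitem__(self, idx):
        return np.random.rand(4).astype('float32')

    def __len__(self):
        return 10

loader = prepare_dataloader(_ToyDataset())
for batch in loader:
    pass  # each batch holds a single sample because batch_size=1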
def test_main(self):
    place = fluid.cpu_places()[0]
    with fluid.dygraph.guard(place):
        dataset = RandomDataset(100)
        batch_sampler = BatchSampler(dataset=dataset, batch_size=4)

        # dataset is not an instance of Dataset
        try:
            loader = DataLoader(dataset=batch_sampler, places=place)
            self.assertTrue(False)
        except AssertionError:
            pass

        # places is None
        try:
            loader = DataLoader(dataset=dataset, places=None)
            self.assertTrue(False)
        except AssertionError:
            pass

        # num_workers < 0
        try:
            loader = DataLoader(dataset=dataset, places=place, num_workers=-1)
            self.assertTrue(False)
        except AssertionError:
            pass

        # timeout < 0
        try:
            loader = DataLoader(dataset=dataset, places=place, timeout=-1)
            self.assertTrue(False)
        except AssertionError:
            pass

        # batch_sampler is not an instance of BatchSampler
        try:
            loader = DataLoader(dataset=dataset, places=place, batch_sampler=dataset)
            self.assertTrue(False)
        except AssertionError:
            pass

        # batch_sampler set together with shuffle/batch_size/drop_last
        try:
            loader = DataLoader(dataset=dataset, places=place, batch_sampler=batch_sampler,
                                shuffle=True, drop_last=True)
            self.assertTrue(False)
        except AssertionError:
            pass

        # batch_sampler set correctly
        try:
            loader = DataLoader(dataset=dataset, places=place, batch_sampler=batch_sampler)
            self.assertTrue(True)
        except AssertionError:
            self.assertTrue(False)
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) args.task_name = args.task_name.lower() metric_class = METRIC_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] train_ds = load_dataset('clue', args.task_name, splits='train') tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, label_list=train_ds.label_list, tokenizer=tokenizer, max_seq_length=args.max_seq_length) train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype="int64" if train_ds.label_list else "float32") # label ): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_ds = load_dataset('clue', args.task_name, splits='dev') dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_labels = 1 if train_ds.label_list == None else len(train_ds.label_list) model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels) # Step1: Initialize a dictionary to save the weights from the origin PPMiniLM model. origin_weights = model.state_dict() # Step2: Convert origin model to supernet. sp_config = supernet(expand_ratio=[1.0]) model = Convert(sp_config).convert(model) # Use weights saved in the dictionary to initialize supernet. utils.set_state_dict(model, origin_weights) del origin_weights super_sd = paddle.load( os.path.join(args.model_name_or_path, 'model_state.pdparams')) model.set_state_dict(super_sd) # Step3: Define teacher model. teacher_model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels) # Step4: Config about distillation. mapping_layers = ['ppminilm.embeddings'] for idx in range(model.ppminilm.config['num_hidden_layers']): mapping_layers.append('ppminilm.encoder.layers.{}'.format(idx)) default_distill_config = { 'lambda_distill': 0.1, 'teacher_model': teacher_model, 'mapping_layers': mapping_layers, } distill_config = DistillConfig(**default_distill_config) # Step5: Config in supernet training. ofa_model = OFA(model, distill_config=distill_config, elastic_order=['width']) criterion = paddle.nn.loss.CrossEntropyLoss( ) if train_ds.label_list else paddle.nn.loss.MSELoss() metric = metric_class() #### Step6: Calculate the importance of neurons and head, #### and then reorder them according to the importance. 
head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance( args.task_name, ofa_model.model, dev_data_loader, loss_fct=criterion, num_layers=model.ppminilm.config['num_hidden_layers'], num_heads=model.ppminilm.config['num_attention_heads']) reorder_neuron_head(ofa_model.model, head_importance, neuron_importance) if paddle.distributed.get_world_size() > 1: ofa_model.model = paddle.DataParallel(ofa_model.model) if args.max_steps > 0: num_training_steps = args.max_steps num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) else: num_training_steps = len(train_data_loader) * args.num_train_epochs num_train_epochs = args.num_train_epochs warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)) global_step = 0 tic_train = time.time() best_res = 0.0 for epoch in range(num_train_epochs): # Step7: Set current epoch and task. ofa_model.set_epoch(epoch) ofa_model.set_task('width') for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, labels = batch for width_mult in args.width_mult_list: # Step8: Broadcast supernet config from width_mult, # and use this config in supernet training. net_config = utils.dynabert_config(ofa_model, width_mult) ofa_model.set_net_config(net_config) logits, teacher_logits = ofa_model(input_ids, segment_ids, attention_mask=[None, None]) rep_loss = ofa_model.calc_distill_loss() logit_loss = soft_cross_entropy(logits, teacher_logits.detach()) loss = rep_loss + args.lambda_logit * logit_loss loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0 or global_step == num_training_steps: tic_eval = time.time() evaluate(teacher_model, metric, dev_data_loader, width_mult=100) print("eval done total : %s s" % (time.time() - tic_eval)) for idx, width_mult in enumerate(args.width_mult_list): net_config = utils.dynabert_config(ofa_model, width_mult) ofa_model.set_net_config(net_config) tic_eval = time.time() res = evaluate(ofa_model, metric, dev_data_loader, width_mult) print("eval done total : %s s" % (time.time() - tic_eval)) if best_res < res: output_dir = args.output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) best_res = res if global_step >= num_training_steps: print("best_res: ", best_res) return print("best_res: ", best_res)
def do_train(): paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() # Reads label_map. label_map_path = os.path.join(args.data_path, "predicate2id.json") if not (os.path.exists(label_map_path) and os.path.isfile(label_map_path)): sys.exit("{} dose not exists or is not a file.".format(label_map_path)) with open(label_map_path, 'r', encoding='utf8') as fp: label_map = json.load(fp) num_classes = (len(label_map.keys()) - 2) * 2 + 2 # Loads pretrained model ERNIE model = ErnieForTokenClassification.from_pretrained( "ernie-1.0", num_classes=num_classes) model = paddle.DataParallel(model) tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0") criterion = BCELossForDuIE() # Loads dataset. train_dataset = DuIEDataset.from_file( os.path.join(args.data_path, 'train_data.json'), tokenizer, args.max_seq_length, True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) collator = DataCollator() train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=collator, return_list=True) eval_file_path = os.path.join(args.data_path, 'dev_data.json') test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer, args.max_seq_length, True) test_batch_sampler = paddle.io.BatchSampler(test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True) test_data_loader = DataLoader(dataset=test_dataset, batch_sampler=test_batch_sampler, collate_fn=collator, return_list=True) # Defines learning rate strategy. steps_by_epoch = len(train_data_loader) num_training_steps = steps_by_epoch * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_ratio) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) # Starts training. 
global_step = 0 logging_steps = 50 save_steps = 10000 tic_train = time.time() for epoch in range(args.num_train_epochs): print("\n=====start training of %d epochs=====" % epoch) tic_epoch = time.time() model.train() for step, batch in enumerate(train_data_loader): input_ids, seq_lens, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch logits = model(input_ids=input_ids) mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and( (input_ids != 2)) loss = criterion(logits, labels, mask) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() loss_item = loss.numpy().item() global_step += 1 if global_step % logging_steps == 0 and rank == 0: print( "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s" % (epoch, args.num_train_epochs, step, steps_by_epoch, loss_item, logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % save_steps == 0 and rank == 0: print("\n=====start evaluating ckpt of %d steps=====" % global_step) precision, recall, f1 = evaluate(model, criterion, test_data_loader, eval_file_path, "eval") print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" % (100 * precision, 100 * recall, 100 * f1)) print("saving checkpoing model_%d.pdparams to %s " % (global_step, args.output_dir)) paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) model.train() # back to train mode tic_epoch = time.time() - tic_epoch print("epoch time footprint: %d hour %d min %d sec" % (tic_epoch // 3600, (tic_epoch % 3600) // 60, tic_epoch % 60)) # Does final evaluation. if rank == 0: print("\n=====start evaluating last ckpt of %d steps=====" % global_step) precision, recall, f1 = evaluate(model, criterion, test_data_loader, eval_file_path, "eval") print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" % (100 * precision, 100 * recall, 100 * f1)) paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) print("\n=====training complete=====")
def do_eval(args): paddle.set_device(args.device) # Create dataset, tokenizer and dataloader. train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test')) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) label_list = train_ds.features['ner_tags'].feature.names label_num = len(label_list) no_entity_id = 0 def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples['tokens'], max_seq_len=args.max_seq_length, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, return_length=True) labels = [] for i, label in enumerate(examples['ner_tags']): label_ids = label if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids): label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) - 2] label_ids = [no_entity_id] + label_ids + [no_entity_id] label_ids += [no_entity_id] * ( len(tokenized_inputs['input_ids'][i]) - len(label_ids)) labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs ignore_label = -100 batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'), # input 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32' ), # segment 'seq_len': Stack(dtype='int64'), 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label }): fn(samples) eval_ds = eval_ds.select(range(len(eval_ds) - 1)) eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True) eval_data_loader = DataLoader(dataset=eval_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = BertForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if args.init_checkpoint_path: model_dict = paddle.load(args.init_checkpoint_path) model.set_dict(model_dict) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) model.eval() metric.reset() for step, batch in enumerate(eval_data_loader): input_ids, token_type_ids, length, labels = batch logits = model(input_ids, token_type_ids) loss = loss_fct(logits, labels) avg_loss = paddle.mean(loss) preds = logits.argmax(axis=2) num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( length, preds, labels) metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) precision, recall, f1_score = metric.accumulate() print("eval loss: %f, precision: %f, recall: %f, f1: %f" % (avg_loss, precision, recall, f1_score))
def train(args):
    # Enable multi-GPU training
    if len(args.gpus.split(',')) > 1:
        dist.init_parallel_env()
    if dist.get_rank() == 0:
        shutil.rmtree('log', ignore_errors=True)
        # Logging writer
        writer = LogWriter(logdir='log')
    # Build the data
    train_dataset = CustomDataset(args.train_root_path, is_train=False)
    # Multi-GPU training needs a distributed sampler
    if len(args.gpus.split(',')) > 1:
        batch_sampler = paddle.io.DistributedBatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True)
    else:
        batch_sampler = paddle.io.BatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset, batch_sampler=batch_sampler,
                              num_workers=args.num_workers)
    print("[%s] Total number of classes: %d" % (datetime.now(), train_dataset.num_classes))
    # Build the model; a ResNet-based model is also provided for other use cases
    if args.use_model == 'resnet_face34':
        model = resnet_face34()
    else:
        model = MobileFaceNet()
    metric_fc = ArcNet(feature_dim=512, class_dim=train_dataset.num_classes)
    if dist.get_rank() == 0:
        paddle.summary(model, input_size=(None, 3, 112, 112))
    # Wrap the models for multi-GPU training
    if len(args.gpus.split(',')) > 1:
        model = paddle.DataParallel(model)
        metric_fc = paddle.DataParallel(metric_fc)
    # Get the epoch number of the checkpoint being resumed
    last_epoch = int(re.findall(r'\d+', args.resume)[-1]) + 1 if args.resume is not None else 0
    # Learning rate decay
    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=args.learning_rate, step_size=10,
                                              gamma=0.1, last_epoch=last_epoch, verbose=True)
    # Set up the optimizer
    optimizer = paddle.optimizer.Momentum(parameters=model.parameters() + metric_fc.parameters(),
                                          learning_rate=scheduler, momentum=0.9,
                                          weight_decay=paddle.regularizer.L2Decay(5e-4))
    # Load a pretrained model
    if args.pretrained_model is not None:
        model_dict = model.state_dict()
        model_state_dict = paddle.load(os.path.join(args.pretrained_model, 'model.pdparams'))
        # Feature layers
        for name, weight in model_dict.items():
            if name in model_state_dict.keys():
                if weight.shape != list(model_state_dict[name].shape):
                    print('{} not used, shape {} unmatched with {} in model.'.format(
                        name, list(model_state_dict[name].shape), weight.shape))
                    model_state_dict.pop(name, None)
            else:
                print('Lack weight: {}'.format(name))
        model.set_dict(model_state_dict)
        print('[%s] Rank %d loaded model parameters successfully' % (datetime.now(), dist.get_rank()))
    # Resume training
    if args.resume is not None:
        model.set_state_dict(paddle.load(os.path.join(args.resume, 'model.pdparams')))
        metric_fc.set_state_dict(paddle.load(os.path.join(args.resume, 'metric_fc.pdparams')))
        optimizer.set_state_dict(paddle.load(os.path.join(args.resume, 'optimizer.pdopt')))
        print('[%s] Rank %d restored model parameters and optimizer state successfully' % (datetime.now(), dist.get_rank()))
    # Loss function
    loss = paddle.nn.CrossEntropyLoss()
    train_step = 0
    test_step = 0
    sum_batch = len(train_loader) * (args.num_epoch - last_epoch)
    # Start training
    for epoch in range(last_epoch, args.num_epoch):
        loss_sum = []
        accuracies = []
        for batch_id, (img, label) in enumerate(train_loader()):
            start = time.time()
            feature = model(img)
            output = metric_fc(feature, label)
            # Compute the loss
            los = loss(output, label)
            los.backward()
            optimizer.step()
            optimizer.clear_grad()
            # Compute the accuracy
            label = paddle.reshape(label, shape=(-1, 1))
            acc = accuracy(input=paddle.nn.functional.softmax(output), label=label)
            accuracies.append(acc.numpy()[0])
            loss_sum.append(los.numpy()[0])
            # Only one process prints during multi-GPU training
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                eta_sec = ((time.time() - start) * 1000) * (
                    sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id)
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                print('[%s] Train epoch %d, batch: %d/%d, loss: %f, accuracy: %f, eta: %s' % (
                    datetime.now(), epoch, batch_id, len(train_loader),
                    sum(loss_sum) / len(loss_sum), sum(accuracies) / len(accuracies), eta_str))
                writer.add_scalar('Train loss', los, train_step)
                train_step += 1
                loss_sum = []
        # Only one process runs evaluation and saves the model during multi-GPU training
        if dist.get_rank() == 0:
            print('=' * 70)
            acc = test(model)
            print('[%s] Test %d, accuracy: %f' % (datetime.now(), epoch, acc))
            print('=' * 70)
            writer.add_scalar('Test acc', acc, test_step)
            # Log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            test_step += 1
            save_model(args, epoch, model, metric_fc, optimizer)
        scheduler.step()
    save_model(args, args.num_epoch, model, metric_fc, optimizer)
optimizerG = paddle.optimizer.Adam(learning_rate=LR,
                                   parameters=generator.parameters(),
                                   beta1=0.5, beta2=0.999)
optimizerD = paddle.optimizer.Adam(learning_rate=LR,
                                   parameters=discriminator.parameters(),
                                   beta1=0.5, beta2=0.999)

# Loss functions
bce_loss = nn.BCELoss()
l1_loss = nn.L1Loss()

# dataloader
data_loader_train = DataLoader(paired_dataset_train, batch_size=BATCH_SIZE,
                               shuffle=True, drop_last=True)
data_loader_test = DataLoader(paired_dataset_test, batch_size=BATCH_SIZE)

results_save_path = 'work/results'
os.makedirs(results_save_path, exist_ok=True)  # save test results of every epoch

weights_save_path = 'work/weights'
os.makedirs(weights_save_path, exist_ok=True)  # save model weights

for epoch in range(EPOCHS):
    for data in tqdm(data_loader_train):
        real_A, real_B = data
def eval():
    paddle.disable_static()
    n_gpus = dist.get_world_size()
    rank = dist.get_rank()
    if n_gpus > 1:
        dist.init_parallel_env()

    args = parse_args()
    if not args.init_from_ckpt:
        raise ValueError('init_from_ckpt should be set when eval.')
    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size, args.char_embed_dim, args.projection_dim, vocab.size,
                dropout=args.dropout, num_layers=args.num_layers,
                num_highways=args.num_highways, char_vocab_size=vocab.char_size)
    if n_gpus > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.eval()

    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
    elmo.set_state_dict(weight_state_dict)
    print("Loaded checkpoint from %s" % args.init_from_ckpt)

    dev_dataset = OneBillionWordDataset(args.dev_data_path, vocab, args.batch_size,
                                        args.unroll_steps, n_gpus, rank, mode='test',
                                        shuffle=False, seed=args.random_seed)
    dev_dataloader = DataLoader(dev_dataset, return_list=True, batch_size=None)

    total_step = total_loss = 0
    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(dev_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs
        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)

        total_loss += loss.numpy()[0]
        total_step += 1

        total_time += (time.time() - batch_start_time)
        if rank == 0:
            if step % args.log_freq == 0:
                print("Eval step %d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                      (step, loss.numpy()[0] * args.unroll_steps, ppl.numpy()[0],
                       total_time / args.log_freq))
                total_time = 0.0
        batch_start_time = time.time()

    avg_loss = total_loss / total_step
    avg_ppl = math.exp(avg_loss)
    if rank == 0:
        print("Eval - average loss: %.4f - average Perplexity: %.4f" %
              (avg_loss * args.unroll_steps, avg_ppl))
def do_predict(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, predict_ds = load_dataset('msra_ner', splits=('train', 'test'), lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(tokenize_and_align_labels, tokenizer=tokenizer,
                         no_entity_id=no_entity_id, max_seq_len=args.max_seq_length)

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)

    raw_data = predict_ds.data
    id2label = dict(enumerate(predict_ds.label_list))

    predict_ds = predict_ds.map(trans_func)
    predict_data_loader = DataLoader(dataset=predict_ds, collate_fn=batchify_fn,
                                     num_workers=0, batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print("The results have been saved in the file: %s, some examples are shown below: " % file_path)
    print("\n".join(preds[:10]))
def run(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() task_name = args.task_name.lower() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) set_seed(args) if rank == 0: if os.path.exists(args.model_name_or_path): print("init checkpoint from %s" % args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. contexts = [examples[i]['context'] for i in range(len(examples))] questions = [examples[i]['question'] for i in range(len(examples))] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) # Let's label those examples! for i, tokenized_example in enumerate(tokenized_examples): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_example["input_ids"] cls_index = input_ids.index(tokenizer.cls_token_id) # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offsets = tokenized_example['offset_mapping'] # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_example['token_type_ids'] # One example can give several spans, this is the index of the example containing this span of text. sample_index = tokenized_example['overflow_to_sample'] answers = examples[sample_index]['answers'] answer_starts = examples[sample_index]['answer_starts'] # Start/end character index of the answer in the text. start_char = answer_starts[0] end_char = start_char + len(answers[0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != 1: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != 1: token_end_index -= 1 # Minus one more to reach actual text token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples[i]["start_positions"] = cls_index tokenized_examples[i]["end_positions"] = cls_index else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). 
while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples[i][ "start_positions"] = token_start_index - 1 while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples[i]["end_positions"] = token_end_index + 1 return tokenized_examples if args.do_train: if args.train_file: train_ds = load_dataset(task_name, data_files=args.train_file) else: train_ds = load_dataset(task_name, splits='train') train_ds.map(prepare_train_features, batched=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_batchify_fn = lambda samples, fn=Dict( { "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), "start_positions": Stack(dtype="int64"), "end_positions": Stack(dtype="int64") }): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = CrossEntropyLossForSQuAD() global_step = 0 tic_train = time.time() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, start_positions, end_positions = batch logits = model(input_ids=input_ids, token_type_ids=token_type_ids) loss = criterion(logits, (start_positions, end_positions)) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch + 1, step + 1, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if rank == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) if global_step == num_training_steps: break def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. 
contexts = [examples[i]['context'] for i in range(len(examples))] questions = [examples[i]['question'] for i in range(len(examples))] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) # For validation, there is no need to compute start and end positions for i, tokenized_example in enumerate(tokenized_examples): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_example['token_type_ids'] # One example can give several spans, this is the index of the example containing this span of text. sample_index = tokenized_example['overflow_to_sample'] tokenized_examples[i]["example_id"] = examples[sample_index]['id'] # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples[i]["offset_mapping"] = [ (o if sequence_ids[k] == 1 else None) for k, o in enumerate(tokenized_example["offset_mapping"]) ] return tokenized_examples if args.do_predict and rank == 0: if args.predict_file: dev_ds = load_dataset(task_name, data_files=args.predict_file) else: dev_ds = load_dataset(task_name, splits='dev') dev_ds.map(prepare_validation_features, batched=True) dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) dev_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id) }): fn(samples) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=dev_batchify_fn, return_list=True) evaluate(model, dev_data_loader, args)
def main(args): paddle.set_device('gpu' if args.n_gpus else 'cpu') paddle.seed(args.seed) world_size = dist.get_world_size() rank = dist.get_rank() if world_size > 1: dist.init_parallel_env() model = UnifiedTransformerLMHeadModel.from_pretrained( args.model_name_or_path) tokenizer = UnifiedTransformerTokenizer.from_pretrained( args.model_name_or_path) if world_size > 1: model = paddle.DataParallel(model) train_dataset = DialogueDataset(args.train_data_path, args.batch_size, tokenizer.pad_token_id, tokenizer.cls_token_id, args.sort_pool_size, args.seed, mode='train') train_dataloader = DataLoader(train_dataset, return_list=True, batch_size=None) valid_dataset = DialogueDataset(args.valid_data_path, args.batch_size, tokenizer.pad_token_id, tokenizer.cls_token_id, args.sort_pool_size, mode='valid') valid_dataloader = DataLoader(valid_dataset, return_list=True, batch_size=None) lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)), args.warmup_steps) optimizer = AdamW(learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ], grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)) step = 0 total_time = 0.0 for epoch in range(args.epochs): if rank == 0: print('\nEpoch %d/%d' % (epoch + 1, args.epochs)) batch_start_time = time.time() for inputs in train_dataloader: step += 1 token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs logits = model(token_ids, type_ids, pos_ids, generation_mask, tgt_pos) loss = F.cross_entropy(logits, tgt_label) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() total_time += (time.time() - batch_start_time) if rank == 0: if step % args.logging_steps == 0: ppl = paddle.exp(loss) print( 'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step' % (step, loss, ppl, optimizer.get_lr(), total_time / args.logging_steps)) total_time = 0.0 if step % args.save_steps == 0: evaluation(model, valid_dataloader) save_ckpt(model, tokenizer, args.save_dir, step) batch_start_time = time.time()
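# NOTE: A plain-Python sketch (no Paddle required) of why NoamDecay is constructed with
# d_model = 1 / (warmup_steps * lr**2) above. The Noam schedule (as implemented by
# paddle.optimizer.lr.NoamDecay with its default learning_rate multiplier) is
# d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5); with that choice of
# d_model it warms up linearly and peaks at exactly `lr` when step == warmup_steps.
def noam_lr(step, warmup_steps, lr):
    d_model = 1.0 / (warmup_steps * lr ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

warmup, peak_lr = 4000, 1e-3
assert abs(noam_lr(warmup, warmup, peak_lr) - peak_lr) < 1e-9
print(noam_lr(1, warmup, peak_lr), noam_lr(warmup, warmup, peak_lr), noam_lr(10 * warmup, warmup, peak_lr))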
def run(args): if args.do_train: assert args.batch_size % args.gradient_accumulation_steps == 0, \ "Please make sure argmument `batch_size` must be divisible by `gradient_accumulation_steps`." paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) set_seed(args) train_examples, dev_examples, test_examples = load_dataset( 'clue', 'cmrc2018', split=["train", "validation", "test"]) column_names = train_examples.column_names if rank == 0: if os.path.exists(args.model_name_or_path): logger.info("init checkpoint from %s" % args.model_name_or_path) model = AutoModelForQuestionAnswering.from_pretrained( args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. contexts = examples['context'] questions = examples['question'] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples['token_type_ids'][i] # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples['answers'][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != 1: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != 1: token_end_index -= 1 token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) return tokenized_examples def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HuggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. contexts = examples['context'] questions = examples['question'] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length, return_attention_mask=True) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples['token_type_ids'][i] context_index = 1 # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. 
tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index and k != len(sequence_ids) - 1 else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if args.do_train: args.batch_size = int(args.batch_size / args.gradient_accumulation_steps) with main_process_first(desc="train dataset map pre-processing"): train_ds = train_examples.map( prepare_train_features, batched=True, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, num_proc=args.num_proc, desc="Running tokenizer on train dataset") train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = DataCollatorWithPadding(tokenizer) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, return_list=True) with main_process_first(desc="evaluate dataset map pre-processing"): dev_ds = dev_examples.map( prepare_validation_features, batched=True, remove_columns=column_names, num_proc=args.num_proc, load_from_cache_file=args.overwrite_cache, desc="Running tokenizer on validation dataset") dev_ds_for_model = dev_ds.remove_columns( ["example_id", "offset_mapping", "attention_mask"]) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.eval_batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_ds_for_model, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, return_list=True) num_training_steps = int( args.max_steps / args.gradient_accumulation_steps) if args.max_steps >= 0 else int( len(train_data_loader) * args.num_train_epochs / args.gradient_accumulation_steps) warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = CrossEntropyLossForSQuAD() best_res = (0.0, 0.0) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): start_positions = batch.pop("start_positions") end_positions = batch.pop("end_positions") logits = model(**batch) loss = criterion(logits, (start_positions, end_positions)) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: global_step += 1 optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: logger.info( "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, num_training_steps, epoch, step + 1, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step >= num_training_steps: logger.info("best_result: %.2f/%.2f" % (best_res[0], best_res[1])) return em, f1 = evaluate(model, dev_examples, dev_ds, dev_data_loader, args) if paddle.distributed.get_rank() == 0 and em > best_res[0]: best_res = (em, f1) if args.save_best_model: output_dir = args.output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) logger.info("best_result: %.2f/%.2f" % (best_res[0], best_res[1])) if args.do_predict and rank == 0: test_ds = test_examples.map(prepare_validation_features, batched=True, remove_columns=column_names, num_proc=args.num_proc) test_ds_for_model = test_ds.remove_columns( ["example_id", "offset_mapping", "attention_mask"]) dev_batchify_fn = DataCollatorWithPadding(tokenizer) test_batch_sampler = paddle.io.BatchSampler( test_ds_for_model, batch_size=args.eval_batch_size, shuffle=False) batchify_fn = DataCollatorWithPadding(tokenizer) test_data_loader = DataLoader(dataset=test_ds_for_model, batch_sampler=test_batch_sampler, collate_fn=batchify_fn, return_list=True) evaluate(model, test_examples, test_ds, test_data_loader, args, do_eval=False)
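# NOTE: run() above folds gradient accumulation into the training loop. The
# self-contained sketch below isolates just that pattern: scale the loss by
# 1 / accum_steps and step the optimizer every accum_steps micro-batches. The tiny
# linear model and random data are stand-ins, not part of the original script.
import paddle

accum_steps = 4
model = paddle.nn.Linear(8, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=model.parameters())

for step in range(16):
    x = paddle.randn([4, 8])
    y = paddle.randint(0, 2, [4])
    loss = paddle.nn.functional.cross_entropy(model(x), y)
    # Scaling makes the accumulated gradient match one large batch of 4 * accum_steps.
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.clear_grad()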
def get_data_sample(self, task_id, train=True): if train: task = self.train_tasks[task_id] if task in self.preload_train_data: dataset = self.preload_train_data[task] else: dataset = MoleculeDataset(self.data_dir + self.dataset + "/new/" + str(task + 1), dataset=self.dataset) if self.update_s_q: s_data, q_data, s_data_eval, q_data_eval = sample_meta_datasets( dataset, self.dataset, task, self.n_shot_train, self.n_query) s_data_y = np.stack([i.y[0] for i in s_data.data_list]) q_data_y = np.stack([i.y[0] for i in q_data.data_list]) s_data_eval_y = np.stack( [i.y[0] for i in s_data_eval.data_list]) q_data_eval_y = np.stack( [i.y[0] for i in q_data_eval.data_list]) adapt_data = { 's_data': G.Graph.batch(s_data.data_list), 's_label': paddle.to_tensor(s_data_y), 'q_data': G.Graph.batch(q_data.data_list), 'q_label': paddle.to_tensor(q_data_y), 'label': paddle.to_tensor(np.concatenate([s_data_y, q_data_y])) } eval_data = { 's_data': G.Graph.batch(s_data_eval.data_list), 's_label': paddle.to_tensor(s_data_eval_y), 'q_data': G.Graph.batch(q_data_eval.data_list), 'q_label': paddle.to_tensor(q_data_eval_y), 'label': paddle.to_tensor( np.concatenate([s_data_eval_y, q_data_eval_y])) } else: s_data, q_data = sample_datasets(dataset, self.dataset, task, self.n_shot_train, self.n_query) s_data_y = np.stack([i.y[0] for i in s_data.data_list]) q_data_y = np.stack([i.y[0] for i in q_data.data_list]) adapt_data = { 'data': G.Graph.batch(s_data.data_list), 'label': paddle.to_tensor(s_data_y) } eval_data = { 'data': G.Graph.batch(q_data.data_list), 'label': paddle.to_tensor(q_data_y) } else: task = self.test_tasks[task_id] if 'train' in self.dataset: dataset = self.preload_test_data[task] if self.args.support_valid: val_dataset = self.preload_valid_data[task] data_name = self.dataset.replace('train', 'valid') else: val_dataset = self.preload_train_data[task] data_name = self.dataset s_data, _, q_data_adapt = sample_test_datasets( val_dataset, data_name, task, self.n_shot_test, self.n_query, self.update_step_test) s_data = self.loader_to_samples(s_data) q_loader = DataLoader(dataset, batch_size=self.n_query, shuffle=True, num_workers=0) q_loader_adapt = DataLoader(q_data_adapt, batch_size=self.n_query, shuffle=True, num_workers=0) adapt_data = { 's_data': s_data, 's_label': s_data.y, 'data_loader': q_loader_adapt } eval_data = { 's_data': s_data, 's_label': s_data.y, 'data_loader': q_loader } return adapt_data, eval_data if task in self.preload_test_data: dataset = self.preload_test_data[task] else: dataset = MoleculeDataset(self.data_dir + self.test_dataset + "/new/" + str(task + 1), dataset=self.test_dataset) s_data, q_data, q_data_adapt = sample_test_datasets( dataset, self.test_dataset, task, self.n_shot_test, self.n_query, self.update_step_test) s_data_y = np.stack([i.y[0] for i in s_data.data_list]) q_loader = q_data.get_data_loader(batch_size=self.n_query, shuffle=True, num_workers=1) q_loader_adapt = q_data_adapt.get_data_loader( batch_size=self.n_query, shuffle=True, num_workers=1) if self.update_s_q: adapt_data = { 's_data': G.Graph.batch(s_data.data_list), 's_label': paddle.to_tensor(s_data_y), 'data_loader': q_loader_adapt } eval_data = { 's_data': G.Graph.batch(s_data.data_list), 's_label': paddle.to_tensor(s_data_y), 'data_loader': q_loader } else: adapt_data = {'data_loader': [s_data] * self.update_step_test} eval_data = {'data_loader': q_loader} return adapt_data, eval_data
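# NOTE: get_data_sample() delegates the support/query split to sample_meta_datasets()
# and sample_test_datasets(). The helper below is a hypothetical, simplified stand-in
# (numpy only) that shows the usual n-shot split: n_shot examples per class for the
# support set, a sample of the remainder as the query set. It is not the project's
# actual sampler.
import numpy as np

def split_support_query(labels, n_shot, n_query, seed=0):
    rng = np.random.default_rng(seed)
    support, remainder = [], []
    for cls in np.unique(labels):
        idx = rng.permutation(np.where(labels == cls)[0])
        support.extend(idx[:n_shot])
        remainder.extend(idx[n_shot:])
    query = rng.choice(remainder, size=min(n_query, len(remainder)), replace=False)
    return np.array(support), np.array(query)

labels = np.array([0, 1, 0, 1, 0, 1, 0, 1])
support_idx, query_idx = split_support_query(labels, n_shot=2, n_query=3)
print(support_idx, query_idx)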
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() task_name = args.task_name.lower() dataset_class = TASK_CLASSES[task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) root = args.data_path set_seed(args) train_ds = dataset_class( tokenizer=tokenizer, root=root, doc_stride=args.doc_stride, max_query_length=args.max_query_length, max_seq_length=args.max_seq_length, mode='train') train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack(), # unipue_id Stack(dtype="int64"), # start_pos Stack(dtype="int64") # end_pos ): [data for i, data in enumerate(fn(samples)) if i != 2] train_data_loader = DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) dev_ds = dataset_class( tokenizer=tokenizer, root=root, doc_stride=args.doc_stride, max_query_length=args.max_query_length, max_seq_length=args.max_seq_length, mode='dev') dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.batch_size, shuffle=False) dev_batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack() # unipue_id ): fn(samples) dev_data_loader = DataLoader( dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=dev_batchify_fn, return_list=True) model = model_class.from_pretrained(args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, warmup_proportion=args.warmup_proportion, num_training_steps=args.max_steps if args.max_steps > 0 else (len(train_data_loader)*args.num_train_epochs): float( current_step) / float(max(1, warmup_proportion*num_training_steps)) if current_step < warmup_proportion*num_training_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - warmup_proportion*num_training_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) criterion = CrossEntropyLossForSQuAD() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, start_positions, end_positions = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = criterion(logits, (start_positions, end_positions)) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, 
"model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: evaluate(model, dev_data_loader, args, tokenizer)
def train(): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() model = ErnieForGeneration.from_pretrained(args.model_name_or_path) if "ernie-tiny" in args.model_name_or_path: tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path) elif "ernie" in args.model_name_or_path: tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path) elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path: tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path) elif "electra" in args.model_name_or_path: tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path) else: tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) if args.init_checkpoint: model_state = paddle.load(args.init_checkpoint) model.set_state_dict(model_state) train_dataset, dev_dataset = load_dataset( 'poetry', splits=('train', 'dev'), lazy=False) attn_id = tokenizer.vocab[ '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]'] tgt_type_id = model.sent_emb.weight.shape[0] - 1 trans_func = convert_example( tokenizer=tokenizer, attn_id=attn_id, tgt_type_id=tgt_type_id, max_encode_len=args.max_encode_len, max_decode_len=args.max_decode_len, noise_prob=args.noise_prob, use_random_noice=args.use_random_noice) train_dataset = train_dataset.map(trans_func) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_pids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # src_tids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_pids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # tgt_tids Pad(axis=0, pad_val=tokenizer.pad_token_id), # attn_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_labels ): after_padding(fn(samples)) train_data_loader = DataLoader( dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.map(trans_func) dev_data_loader = DataLoader( dataset=dev_dataset, batch_size=args.batch_size, collate_fn=batchify_fn, num_workers=0, return_list=True) label_num = model.word_emb.weight.shape[0] train_model = StackModel(model) if paddle.distributed.get_world_size() > 1: # All 'forward' outputs derived from the module parameters using in DataParallel # must participate in the calculation of losses and subsequent gradient calculations. # So we use StackModel here to make the model only output loss in its 'forward' function. train_model = paddle.DataParallel(train_model) max_steps = len(train_data_loader) * args.num_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=nn.ClipGradByGlobalNorm(1.0), apply_decay_param_fun=lambda x: x in decay_params) rouge1 = Rouge1() rouge2 = Rouge2() global_step = 1 tic_train = time.time() for epoch in range(args.num_epochs): for step, batch in enumerate(train_data_loader, start=1): (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels, _) = batch # import pdb; pdb.set_trace() if args.label_smooth > 0.: tgt_labels = nn.functional.label_smooth( nn.functional.one_hot(tgt_labels, label_num), epsilon=args.label_smooth) tgt_pos = paddle.nonzero(attn_ids == attn_id) loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels, tgt_pos) if global_step % args.logging_steps == 0: if paddle.distributed.get_rank() == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train), lr_scheduler.get_lr())) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 and paddle.distributed.get_rank( ) == 0: evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2, attn_id, tgt_type_id, args) output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) global_step += 1
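# NOTE: A short, self-contained check of the label-smoothing step used above: the hard
# target ids are one-hot encoded and then smoothed with paddle.nn.functional.label_smooth,
# so each row stays a valid distribution. The vocabulary size and epsilon below are
# illustrative values only.
import paddle

vocab_size, epsilon = 6, 0.1
tgt_labels = paddle.to_tensor([1, 4], dtype='int64')
one_hot = paddle.nn.functional.one_hot(tgt_labels, vocab_size)
smoothed = paddle.nn.functional.label_smooth(one_hot, epsilon=epsilon)
print(smoothed.numpy())       # off-target entries become epsilon / vocab_size
print(smoothed.sum(axis=-1))  # every row still sums to 1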
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() # Create dataset, tokenizer and dataloader. if args.dataset == "peoples_daily_ner": raw_datasets = load_dataset(args.dataset) else: raw_datasets = load_dataset(args.dataset) AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type] tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) train_ds = raw_datasets['train'] label_list = train_ds.features['ner_tags'].feature.names label_num = len(label_list) no_entity_id = 0 def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples['tokens'], max_seq_len=args.max_seq_length, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, return_length=True) labels = [] for i, label in enumerate(examples['ner_tags']): label_ids = label if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids): label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) - 2] label_ids = [no_entity_id] + label_ids + [no_entity_id] label_ids += [no_entity_id] * ( len(tokenized_inputs['input_ids'][i]) - len(label_ids)) labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs train_ds = train_ds.select(range(len(train_ds) - 1)) train_ds = train_ds.map(tokenize_and_align_labels, batched=True) ignore_label = -100 batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'), # input 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32' ), # segment 'seq_len': Stack(dtype='int64'), # seq_len 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label }): fn(samples) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_data_loader = DataLoader(dataset=train_ds, collate_fn=batchify_fn, num_workers=0, batch_sampler=train_batch_sampler, return_list=True) test_ds = raw_datasets['test'] test_ds = test_ds.select(range(len(test_ds) - 1)) test_ds = test_ds.map(tokenize_and_align_labels, batched=True) test_data_loader = DataLoader(dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) if args.dataset == "peoples_daily_ner": dev_ds = raw_datasets['validation'] dev_ds = dev_ds.select(range(len(dev_ds) - 1)) dev_ds = dev_ds.map(tokenize_and_align_labels, batched=True) dev_data_loader = DataLoader(dataset=dev_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = AutoForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) global_step = 0 last_step = args.num_train_epochs * len(train_data_loader) tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, _, labels = batch logits = model(input_ids, token_type_ids) loss = loss_fct(logits, labels) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: if args.dataset == "peoples_daily_ner": evaluate(model, loss_fct, metric, dev_data_loader, label_num, "valid") evaluate(model, loss_fct, metric, test_data_loader, label_num, "test") paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) if global_step >= num_training_steps: return
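# NOTE: The label alignment inside tokenize_and_align_labels() is easier to see on a
# single example. The sketch below mirrors that logic with plain lists: word-level tags
# are truncated to the tokenized length minus [CLS]/[SEP], then padded with no_entity_id
# for the special tokens (cross-example padding is handled later by the collator with
# ignore_label).
no_entity_id = 0

def align_labels(input_ids_len, word_tags):
    tags = word_tags[:input_ids_len - 2]           # drop tags whose words were truncated away
    tags = [no_entity_id] + tags + [no_entity_id]  # cover [CLS] and [SEP]
    tags += [no_entity_id] * (input_ids_len - len(tags))
    return tags

# 6 sub-word ids ([CLS] + 4 tokens + [SEP]) but 5 word-level tags: the last tag is dropped.
assert align_labels(6, [1, 2, 0, 0, 3]) == [0, 1, 2, 0, 0, 0]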
import numpy as np
import paddle
from paddle.io import Dataset, DataLoader

# Assumed values: the original snippet used these constants without defining them.
IMAGE_SIZE, CLASS_NUM = 784, 10
BATCH_NUM, BATCH_SIZE, EPOCH_NUM = 10, 16, 2

USE_GPU = False  # whether to use the GPU to run the model


# define a random dataset
class RandomDataset(Dataset):

    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples


# The original snippet called paddle.enable_static() here, but the loop below unpacks
# (image, label) tuples, which is dynamic-graph DataLoader behaviour; setting the device
# for dynamic mode (an assumption) keeps the example runnable as written.
paddle.set_device('gpu' if USE_GPU else 'cpu')

dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True,
                    num_workers=2)

for e in range(EPOCH_NUM):
    for i, (image, label) in enumerate(loader()):
        print(type(image))        # each batch is a paddle.Tensor of shape [BATCH_SIZE, IMAGE_SIZE]
        print(image.__array__())  # view the batch as a numpy array
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) args.task_name = args.task_name.lower() dataset_class, metric_class = TASK_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] train_dataset = dataset_class.get_datasets(["train"]) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_dataset.get_labels(), max_seq_length=args.max_seq_length) train_dataset = train_dataset.apply(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment Stack(), # length Stack(dtype="int64" if train_dataset.get_labels() else "float32") # label ): [data for i, data in enumerate(fn(samples)) if i != 2] train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) if args.task_name == "mnli": dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets( ["dev_matched", "dev_mismatched"]) dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True) dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func, lazy=True) dev_batch_sampler_matched = paddle.io.BatchSampler( dev_dataset_matched, batch_size=args.batch_size, shuffle=False) dev_data_loader_matched = DataLoader( dataset=dev_dataset_matched, batch_sampler=dev_batch_sampler_matched, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_batch_sampler_mismatched = paddle.io.BatchSampler( dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False) dev_data_loader_mismatched = DataLoader( dataset=dev_dataset_mismatched, batch_sampler=dev_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=0, return_list=True) else: dev_dataset = dataset_class.get_datasets(["dev"]) dev_dataset = dev_dataset.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_classes = 1 if train_dataset.get_labels() == None else len( train_dataset.get_labels()) model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_classes) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_train_epochs) warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (int( math.floor(num_training_steps * args.warmup_proportion))) lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, num_warmup_steps=warmup_steps, num_training_steps= num_training_steps: float(current_step) / float( max(1, num_warmup_steps)) if current_step < num_warmup_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - num_warmup_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in 
model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels( ) else paddle.nn.loss.MSELoss() metric = metric_class() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct(logits, labels) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.logging_steps == 0: logger.info( "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" % (global_step, num_training_steps, epoch, step, paddle.distributed.get_rank(), loss, optimizer.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0: tic_eval = time.time() if args.task_name == "mnli": evaluate(model, loss_fct, metric, dev_data_loader_matched) evaluate(model, loss_fct, metric, dev_data_loader_mismatched) logger.info("eval done total : %s s" % (time.time() - tic_eval)) else: evaluate(model, loss_fct, metric, dev_data_loader) logger.info("eval done total : %s s" % (time.time() - tic_eval)) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: output_dir = os.path.join( args.output_dir, "%s_ft_model_%d.pdparams" % (args.task_name, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir)
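# NOTE: The apply_decay_param_fun idiom above recurs in almost every script here: weight
# decay is applied only to parameters whose attribute names contain neither "bias" nor
# "norm". The toy model below exists only to make the filter visible.
import paddle

class ToyModel(paddle.nn.Layer):

    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(4, 4)
        self.norm = paddle.nn.LayerNorm(4)

    def forward(self, x):
        return self.norm(self.linear(x))

model = ToyModel()
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
# Only linear.weight survives the filter; the biases and the LayerNorm parameters do not.
print([n for n, p in model.named_parameters()
       if not any(nd in n for nd in ["bias", "norm"])])
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4,
                                   parameters=model.parameters(),
                                   weight_decay=0.01,
                                   apply_decay_param_fun=lambda x: x in decay_params)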
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() train_dataset, dev_dataset = ppnlp.datasets.MSRA_NER.get_datasets( ["train", "dev"]) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) label_list = train_dataset.get_labels() label_num = len(label_list) no_entity_id = label_num - 1 trans_func = partial(convert_example, tokenizer=tokenizer, label_list=label_list, no_entity_id=label_num - 1, max_seq_length=args.max_seq_length) train_dataset = train_dataset.apply(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) ignore_label = -100 batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack(), # length Pad(axis=0, pad_val=ignore_label) # label ): fn(samples) train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) model = BertForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps if args.max_steps > 0 else (len(train_data_loader) * args.num_train_epochs): float( current_step) / float(max(1, num_warmup_steps)) if current_step < num_warmup_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - num_warmup_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(int(math.ceil((label_num + 1) / 2.0)), "IOB") global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): input_ids, segment_ids, length, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct(logits.reshape([-1, label_num]), labels.reshape([-1])) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0: evaluate(model, loss_fct, metric, dev_data_loader, label_num) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) global_step += 1
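# NOTE: ignore_label = -100 above only works because the loss is built with
# CrossEntropyLoss(ignore_index=ignore_label): padded label positions contribute nothing.
# A minimal check with made-up logits and labels:
import paddle

ignore_label = -100
loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
logits = paddle.randn([1, 4, 3])  # batch 1, seq_len 4, 3 tag classes
labels = paddle.to_tensor([[1, 2, ignore_label, ignore_label]], dtype='int64')
loss = loss_fct(logits.reshape([-1, 3]), labels.reshape([-1]))
print(loss)  # only the two real tokens contribute to the mean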
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) args.task_name = args.task_name.lower() dataset_class, metric_class = TASK_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] train_ds = dataset_class.get_datasets(['train']) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_ds.get_labels(), max_seq_length=args.max_seq_length) train_ds = train_ds.apply(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment Stack(), # length Stack(dtype="int64" if train_ds.get_labels() else "float32") # label ): [data for i, data in enumerate(fn(samples)) if i != 2] train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) if args.task_name == "mnli": dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets( ["dev_matched", "dev_mismatched"]) dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True) dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func, lazy=True) dev_batch_sampler_matched = paddle.io.BatchSampler( dev_dataset_matched, batch_size=args.batch_size, shuffle=False) dev_data_loader_matched = DataLoader( dataset=dev_dataset_matched, batch_sampler=dev_batch_sampler_matched, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_batch_sampler_mismatched = paddle.io.BatchSampler( dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False) dev_data_loader_mismatched = DataLoader( dataset=dev_dataset_mismatched, batch_sampler=dev_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=0, return_list=True) else: dev_dataset = dataset_class.get_datasets(["dev"]) dev_dataset = dev_dataset.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_labels = 1 if train_ds.get_labels() == None else len( train_ds.get_labels()) model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) # Step1: Initialize a dictionary to save the weights from the origin BERT model. origin_weights = {} for name, param in model.named_parameters(): origin_weights[name] = param # Step2: Convert origin model to supernet. sp_config = supernet(expand_ratio=args.width_mult_list) model = Convert(sp_config).convert(model) # Use weights saved in the dictionary to initialize supernet. utils.set_state_dict(model, origin_weights) del origin_weights # Step3: Define teacher model. teacher_model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels) # Step4: Config about distillation. 
mapping_layers = ['bert.embeddings'] for idx in range(model.bert.config['num_hidden_layers']): mapping_layers.append('bert.encoder.layers.{}'.format(idx)) default_distill_config = { 'lambda_distill': 0.1, 'teacher_model': teacher_model, 'mapping_layers': mapping_layers, } distill_config = DistillConfig(**default_distill_config) # Step5: Config in supernet training. ofa_model = OFA(model, distill_config=distill_config, elastic_order=['width']) criterion = paddle.nn.loss.CrossEntropyLoss() if train_ds.get_labels( ) else paddle.nn.loss.MSELoss() metric = metric_class() if args.task_name == "mnli": dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched) # Step6: Calculate the importance of neurons and head, # and then reorder them according to the importance. head_importance, neuron_importance = utils.compute_neuron_head_importance( args.task_name, ofa_model.model, dev_data_loader, loss_fct=criterion, num_layers=model.bert.config['num_hidden_layers'], num_heads=model.bert.config['num_attention_heads']) reorder_neuron_head(ofa_model.model, head_importance, neuron_importance) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=ofa_model.model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in ofa_model.model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): # Step7: Set current epoch and task. ofa_model.set_epoch(epoch) ofa_model.set_task('width') for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, labels = batch for width_mult in args.width_mult_list: # Step8: Broadcast supernet config from width_mult, # and use this config in supernet training. 
net_config = apply_config(ofa_model, width_mult) ofa_model.set_net_config(net_config) logits, teacher_logits = ofa_model(input_ids, segment_ids, attention_mask=[None, None]) rep_loss = ofa_model.calc_distill_loss() if args.task_name == 'sts-b': logit_loss = 0.0 else: logit_loss = soft_cross_entropy(logits, teacher_logits.detach()) loss = rep_loss + args.lambda_logit * logit_loss loss.backward() optimizer.step() lr_scheduler.step() ofa_model.model.clear_gradients() if global_step % args.logging_steps == 0: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0: if args.task_name == "mnli": evaluate(teacher_model, criterion, metric, dev_data_loader_matched, width_mult=100) evaluate(teacher_model, criterion, metric, dev_data_loader_mismatched, width_mult=100) else: evaluate(teacher_model, criterion, metric, dev_data_loader, width_mult=100) for idx, width_mult in enumerate(args.width_mult_list): net_config = apply_config(ofa_model, width_mult) ofa_model.set_net_config(net_config) tic_eval = time.time() if args.task_name == "mnli": acc = evaluate(ofa_model, criterion, metric, dev_data_loader_matched, width_mult) evaluate(ofa_model, criterion, metric, dev_data_loader_mismatched, width_mult) print("eval done total : %s s" % (time.time() - tic_eval)) else: acc = evaluate(ofa_model, criterion, metric, dev_data_loader, width_mult) print("eval done total : %s s" % (time.time() - tic_eval)) if (not args.n_gpu > 1 ) or paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir)
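# NOTE: soft_cross_entropy() above is the usual knowledge-distillation logit loss:
# cross-entropy between the student's log-softmax and the teacher's softmax. The real
# helper lives elsewhere in the repo; the version below is a sketch of the standard
# formula, not necessarily the exact implementation.
import paddle
import paddle.nn.functional as F

def soft_cross_entropy(student_logits, teacher_logits):
    student_likelihood = F.log_softmax(student_logits, axis=-1)
    targets_prob = F.softmax(teacher_logits, axis=-1)
    return (-targets_prob * student_likelihood).sum(axis=-1).mean()

student = paddle.randn([2, 3])
teacher = paddle.randn([2, 3])
print(soft_cross_entropy(student, teacher.detach()))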
def main_worker(gpu, ngpus_per_node, args): args.gpu = dist.get_rank() # None logger = get_logger('dcq', log_file='{}/workerlog.{}'.format(args.save, args.gpu), level='info', rank=args.gpu) # suppress printing if not master if args.distributed and args.gpu != 0: def print_pass(*args): pass builtins.print = print_pass if args.gpu is not None: logger.info("Use GPU: {} for training".format(args.gpu)) if args.distributed: dist.init_parallel_env() # create model logger.info("=> creating model '{}'".format(args.arch)) if args.arch in models.__dict__.keys(): backbone = models.__dict__[args.arch] else: raise NotImplementedError model = DCQ( backbone, args.feat_dim, args.queue_size, args.dcq_momentum, args.scale, args.margin, ) if args.distributed: model = paddle.DataParallel(model) criterion = paddle.nn.loss.CrossEntropyLoss(reduction='mean') optimizer = paddle.optimizer.Momentum(learning_rate=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, parameters=model.parameters()) if args.resume: if os.path.isfile(args.resume + '.pdparams'): print("=> loading checkpoint '{}'".format(args.resume)) with open(args.resume + '.state.pickle', 'rb') as fin: state = pickle.load(fin) args.start_epoch = state['epoch'] state_dict = paddle.load(args.resume + '.pdparams') print(model.set_state_dict(state_dict)) optimizer_state = paddle.load(args.resume + '.pdopt') optimizer.set_state_dict(optimizer_state) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, state['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # Data loading code augmentation = build_aug(args) if args.filelist is not None: roots = args.data.split(';') anno_files = args.filelist.split(';') probs = args.dataprob.split(';') assert len(roots) == len(anno_files) assert len(probs) == len(anno_files) datasets = [] for root, anno_file in zip(roots, anno_files): datasets.append(ImageDataset(root=root, anno_file=anno_file)) probs = [float(v) for v in probs] else: raise NotImplementedError data_processing = Processing(transform=augmentation) train_dataset = Sampler(datasets, probs, samples_per_epoch=args.iter_per_epoch * args.batch_size, processing=data_processing, k=2, sampling_base=args.sampling_base) if args.sampling_base == 'image': train_sampler = paddle.io.DistributedBatchSampler(train_dataset, args.batch_size, shuffle=True, drop_last=True) train_loader = DataLoader(train_dataset, num_workers=1, batch_sampler=train_sampler) else: train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=1, drop_last=True) print( f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}: DataLoader is ready.' ) for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch, args) # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args, logger) if args.gpu == 0 and epoch > args.schedule[0]: # skip saving the queue state_dict = {} model_state_dict = model.state_dict() for key in model_state_dict: # we don't need to save the queue if 'queue' not in key: state_dict[key] = model_state_dict[key] save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': state_dict, 'optimizer': optimizer.state_dict(), }, filename='{}/face_checkpoint_{:04d}'.format(args.save, epoch))
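# NOTE: The checkpoint code above drops the momentum queue before saving because it is
# large and can be re-initialised on resume. A compact equivalent of that filtering step:
def strip_queue(state_dict, keyword='queue'):
    """Return a copy of state_dict without the queue buffers."""
    return {k: v for k, v in state_dict.items() if keyword not in k}

# e.g. save_checkpoint({'state_dict': strip_queue(model.state_dict()), ...}, filename=...)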