def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() # Create dataset, tokenizer and dataloader. if args.dataset == "peoples_daily_ner": raw_datasets = load_dataset(args.dataset) else: raw_datasets = load_dataset(args.dataset) AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type] tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) train_ds = raw_datasets['train'] label_list = train_ds.features['ner_tags'].feature.names label_num = len(label_list) no_entity_id = 0 def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples['tokens'], max_seq_len=args.max_seq_length, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, return_length=True) labels = [] for i, label in enumerate(examples['ner_tags']): label_ids = label if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids): label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) - 2] label_ids = [no_entity_id] + label_ids + [no_entity_id] label_ids += [no_entity_id] * ( len(tokenized_inputs['input_ids'][i]) - len(label_ids)) labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs train_ds = train_ds.select(range(len(train_ds) - 1)) column_names = train_ds.column_names train_ds = train_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) ignore_label = -100 batchify_fn = DataCollatorForTokenClassification( tokenizer=tokenizer, label_pad_token_id=ignore_label) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_data_loader = DataLoader(dataset=train_ds, collate_fn=batchify_fn, num_workers=0, batch_sampler=train_batch_sampler, return_list=True) test_ds = raw_datasets['test'] test_ds = test_ds.select(range(len(test_ds) - 1)) test_ds = test_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) test_data_loader = DataLoader(dataset=test_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) if args.dataset == "peoples_daily_ner": dev_ds = raw_datasets['validation'] dev_ds = dev_ds.select(range(len(dev_ds) - 1)) dev_ds = dev_ds.map(tokenize_and_align_labels, batched=True, remove_columns=column_names) dev_data_loader = DataLoader(dataset=dev_ds, collate_fn=batchify_fn, num_workers=0, batch_size=args.batch_size, return_list=True) # Define the model netword and its loss model = AutoForTokenClassification.from_pretrained(args.model_name_or_path, num_classes=label_num) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) metric = ChunkEvaluator(label_list=label_list) global_step = 0 last_step = args.num_train_epochs * len(train_data_loader) tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 logits = model(batch['input_ids'], batch['token_type_ids']) loss = loss_fct(logits, batch['labels']) avg_loss = paddle.mean(loss) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, avg_loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() avg_loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: if args.dataset == "peoples_daily_ner": evaluate(model, loss_fct, metric, dev_data_loader, label_num, "valid") evaluate(model, loss_fct, metric, test_data_loader, label_num, "test") paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) if global_step >= num_training_steps: return
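# The training loop above calls an `evaluate` helper that is not defined in this
# section. The sketch below is a plausible, hedged reconstruction built on
# ChunkEvaluator's compute/update/accumulate API; the batch key 'length' is an
# assumption (the tokenizer is called with return_length=True), and the exact
# original definition may differ.
def evaluate(model, loss_fct, metric, data_loader, label_num, mode="valid"):
    model.eval()
    metric.reset()
    avg_loss, precision, recall, f1_score = 0, 0, 0, 0
    for batch in data_loader:
        logits = model(batch['input_ids'], batch['token_type_ids'])
        loss = loss_fct(logits, batch['labels'])
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            batch['length'], preds, batch['labels'])
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
        precision, recall, f1_score = metric.accumulate()
    print("%s: eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (mode, avg_loss, precision, recall, f1_score))
    model.train()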
def run(args): if args.do_train: assert args.batch_size % args.gradient_accumulation_steps == 0, \ "Please make sure argmument `batch_size` must be divisible by `gradient_accumulation_steps`." max_seq_length = args.max_seq_length max_num_choices = 4 def preprocess_function(examples, do_predict=False): def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length): """Truncates a sequence tuple in place to the maximum length.""" # This is a simple heuristic which will always truncate the longer # sequence one token at a time. This makes more sense than # truncating an equal percent of tokens from each, since if one # sequence is very short then each token that's truncated likely # contains more information than a longer sequence. while True: total_length = len(tokens_a) + len(tokens_b) + len(tokens_c) if total_length <= max_length: break if len(tokens_a) >= len(tokens_b) and len(tokens_a) >= len( tokens_c): tokens_a.pop() elif len(tokens_b) >= len(tokens_a) and len(tokens_b) >= len( tokens_c): tokens_b.pop() else: tokens_c.pop() num_examples = len(examples.data["question"]) if do_predict: result = {"input_ids": [], "token_type_ids": []} else: result = {"input_ids": [], "token_type_ids": [], "labels": []} for idx in range(num_examples): text = '\n'.join(examples.data["context"][idx]).lower() question = examples.data["question"][idx].lower() choice_list = examples.data["choice"][idx] choice_list = [choice.lower() for choice in choice_list][:max_num_choices] if not do_predict: answer = examples.data["answer"][idx].lower() label = choice_list.index(answer) tokens_t = tokenizer.tokenize(text) tokens_q = tokenizer.tokenize(question) tokens_t_list = [] tokens_c_list = [] # Pad each new example for axis=1, [batch_size, num_choices, seq_len] while len(choice_list) < max_num_choices: choice_list.append('无效答案') for choice in choice_list: tokens_c = tokenizer.tokenize(choice.lower()) _truncate_seq_tuple(tokens_t, tokens_q, tokens_c, max_seq_length - 4) tokens_c = tokens_q + ["[SEP]"] + tokens_c tokens_t_list.append(tokens_t) tokens_c_list.append(tokens_c) new_data = tokenizer( tokens_t_list, text_pair=tokens_c_list, is_split_into_words=True) # Pad each new example for axis=2 of [batch_size, num_choices, seq_len], # because length of each choice could be different. input_ids = Pad( axis=0, pad_val=tokenizer.pad_token_id)(new_data["input_ids"]) token_type_ids = Pad( axis=0, pad_val=tokenizer.pad_token_id)(new_data["token_type_ids"]) # Final shape of input_ids: [batch_size, num_choices, seq_len] result["input_ids"].append(input_ids) result["token_type_ids"].append(token_type_ids) if not do_predict: result["labels"].append([label]) if (idx + 1) % 1000 == 0: logger.info("%d samples have been processed." 
% (idx + 1)) return result paddle.set_device(args.device) set_seed(args) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) model = AutoModelForMultipleChoice.from_pretrained( args.model_name_or_path, num_choices=max_num_choices) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) train_ds, dev_ds, test_ds = load_dataset( "clue", "c3", split=["train", "validation", "test"]) if args.do_train: args.batch_size = int(args.batch_size / args.gradient_accumulation_steps) column_names = train_ds.column_names with main_process_first(desc="train dataset map pre-processing"): train_ds = train_ds.map( preprocess_function, batched=True, batch_size=len(train_ds), num_proc=args.num_proc, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on train dataset") batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id), # input 'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id), # segment 'labels': Stack(dtype="int64") # label }): fn(samples) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_data_loader = paddle.io.DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) with main_process_first(desc="evaluate dataset map pre-processing"): dev_ds = dev_ds.map(preprocess_function, batched=True, batch_size=len(dev_ds), remove_columns=column_names, num_proc=args.num_proc, load_from_cache_file=args.overwrite_cache, desc="Running tokenizer on validation dataset") dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.eval_batch_size, shuffle=False) dev_data_loader = paddle.io.DataLoader( dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, return_list=True) num_training_steps = int( args.max_steps / args.gradient_accumulation_steps) if args.max_steps >= 0 else int( len(train_data_loader) * args.num_train_epochs / args.gradient_accumulation_steps) warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=grad_clip) loss_fct = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() model.train() global_step = 0 best_acc = 0.0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): input_ids, segment_ids, label = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = loss_fct(logits, label) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: global_step += 1 optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: logger.info( "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" % (global_step, num_training_steps, epoch, step + 1, paddle.distributed.get_rank(), loss, optimizer.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step >= num_training_steps: logger.info("best_result: %.2f" % (best_acc * 100)) return tic_eval = time.time() acc = evaluation(model, loss_fct, dev_data_loader, metric) logger.info("eval acc: %.5f, eval done total : %s s" % (acc, time.time() - tic_eval)) if paddle.distributed.get_rank() == 0 and acc > best_acc: best_acc = acc if args.save_best_model: model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) logger.info("best_result: %.2f" % (best_acc * 100)) if args.do_predict: column_names = test_ds.column_names test_ds = test_ds.map(partial( preprocess_function, do_predict=True), batched=True, batch_size=len(test_ds), remove_columns=column_names, num_proc=args.num_proc) # Serveral samples have more than four choices. test_batch_sampler = paddle.io.BatchSampler( test_ds, batch_size=1, shuffle=False) batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id), # input 'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id), # segment }): fn(samples) test_data_loader = paddle.io.DataLoader( dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn, return_list=True) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) f = open(os.path.join(args.output_dir, "c311_predict.json"), 'w') result = {} idx = 0 for step, batch in enumerate(test_data_loader): input_ids, segment_ids = batch with paddle.no_grad(): logits = model(input_ids, segment_ids) preds = paddle.argmax(logits, axis=1).numpy().tolist() for pred in preds: result[str(idx)] = pred j = json.dumps({"id": idx, "label": pred}) f.write(j + "\n") idx += 1
def do_train(args):
    # Initialization for the parallel environment
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()

    # Set the random seed for the training process
    set_seed(args)
    worker_init = WorkerInitObj(args.seed + worker_index)

    # Get the model class and tokenizer class
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define the pretraining model and criterion
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models_list:
        model = BigBirdForPretraining(
            BigBirdModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path]))
    else:
        model = BigBirdForPretraining.from_pretrained(args.model_name_or_path)
    # Get the BigBird config used to generate the random attention mask
    config = getattr(model, BigBirdForPretraining.base_model_prefix).config
    criterion = BigBirdPretrainingCriterion(config["vocab_size"],
                                            args.use_nsp)
    if worker_num > 1:
        model = paddle.DataParallel(model)

    # Define the learning_rate scheduler and optimizer
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, args.max_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.epochs):
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
        ]
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            train_data_loader = create_dataloader(
                files[f_id], tokenizer, worker_init, args.batch_size,
                args.max_encoder_length, args.max_pred_length, config)
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
                 masked_lm_weights, next_sentence_labels,
                 masked_lm_scale) = batch[:7]
                rand_mask_idx_list = batch[7:]
                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    rand_mask_idx_list=rand_mask_idx_list,
                    masked_positions=masked_lm_positions)
                loss = criterion(prediction_scores, seq_relationship_score,
                                 masked_lm_ids, next_sentence_labels,
                                 masked_lm_scale, masked_lm_weights)
                if global_step % args.logging_steps == 0 and worker_index == 0:
                    logger.info(
                        "global step %d, epoch: %d, lr: %.10f, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, optimizer.get_lr(), loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need a better way to get the inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))
                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
def main(args): paddle.seed(12345) # load config config = load_yaml(args.config_yaml) config["config_abs_dir"] = args.abs_dir # load static model class static_model_class = load_static_model_class(config) input_data = static_model_class.create_feeds() input_data_names = [data.name for data in input_data] fetch_vars = static_model_class.net(input_data) #infer_target_var = model.infer_target_var logger.info("cpu_num: {}".format(os.getenv("CPU_NUM"))) static_model_class.create_optimizer() use_gpu = config.get("runner.use_gpu", True) use_auc = config.get("runner.use_auc", False) train_data_dir = config.get("runner.train_data_dir", None) epochs = config.get("runner.epochs", None) print_interval = config.get("runner.print_interval", None) model_save_path = config.get("runner.model_save_path", "model_output") model_init_path = config.get("runner.model_init_path", None) batch_size = config.get("runner.train_batch_size", None) os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1)) logger.info("**************common.configs**********") logger.info( "use_gpu: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}". format(use_gpu, train_data_dir, epochs, print_interval, model_save_path)) logger.info("**************common.configs**********") place = paddle.set_device('gpu' if use_gpu else 'cpu') exe = paddle.static.Executor(place) # initialize exe.run(paddle.static.default_startup_program()) last_epoch_id = config.get("last_epoch", -1) train_dataloader = create_data_loader(config=config, place=place) for epoch_id in range(last_epoch_id + 1, epochs): epoch_begin = time.time() interval_begin = time.time() train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() if use_auc: reset_auc() for batch_id, batch_data in enumerate(train_dataloader()): train_reader_cost += time.time() - reader_start train_start = time.time() fetch_batch_var = exe.run( program=paddle.static.default_main_program(), feed=dict(zip(input_data_names, batch_data)), fetch_list=[var for _, var in fetch_vars.items()]) train_run_cost += time.time() - train_start total_samples += batch_size if batch_id % print_interval == 0: metric_str = "" for var_idx, var_name in enumerate(fetch_vars): metric_str += "{}: {}, ".format(var_name, fetch_batch_var[var_idx]) logger.info( "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) + metric_str + "avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec". format(train_reader_cost / print_interval, ( train_reader_cost + train_run_cost) / print_interval, total_samples / print_interval, total_samples / ( train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() metric_str = "" for var_idx, var_name in enumerate(fetch_vars): metric_str += "{}: {}, ".format(var_name, fetch_batch_var[var_idx]) logger.info("epoch: {} done, ".format(epoch_id) + metric_str + "epoch time: {:.2f} s".format(time.time() - epoch_begin)) save_static_model( paddle.static.default_main_program(), model_save_path, epoch_id, prefix='rec_static')
"""
 * @file test.py
 * @author [email protected]
 * @date 2020-12-30 15:53
 * @brief
 **************************************************************************/
"""
import sys

import numpy as np
import paddle
from paddle.distributed import ReduceOp
from paddle.distributed import init_parallel_env

from dist_utils import run_priority

types = [np.float16, np.float32, np.float64, np.int32, np.int64]

paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()


@run_priority(level='P0')
def test_all_reduce_max():
    """all reduce max"""
    for t in types:
        if paddle.distributed.ParallelEnv().local_rank == 0:
            np_data = np.array([[4, 5, 6], [4, 5, 6]]).astype(t)
        else:
            np_data = np.array([[1, 2, 3], [1, 2, 3]]).astype(t)
        data = paddle.to_tensor(np_data)
        paddle.distributed.all_reduce(data, ReduceOp.MAX)
        out = data.numpy()
        assert out[0][0] == 4
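# A minimal companion sketch (not part of the original file): the same pattern
# applied to ReduceOp.SUM, assuming a two-rank job so the expected value at
# position [0][0] is 4 + 1 = 5. It reuses `types`, `run_priority` and the
# initialized parallel environment from above.
@run_priority(level='P0')
def test_all_reduce_sum():
    """all reduce sum (sketch, assumes world_size == 2)"""
    for t in types:
        if paddle.distributed.ParallelEnv().local_rank == 0:
            np_data = np.array([[4, 5, 6], [4, 5, 6]]).astype(t)
        else:
            np_data = np.array([[1, 2, 3], [1, 2, 3]]).astype(t)
        data = paddle.to_tensor(np_data)
        paddle.distributed.all_reduce(data, ReduceOp.SUM)
        out = data.numpy()
        assert out[0][0] == 5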
def do_predict(args):
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_ds, predict_ds = load_dataset(
        'msra_ner', splits=('train', 'test'), lazy=False)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1

    trans_func = partial(
        tokenize_and_align_labels,
        tokenizer=tokenizer,
        no_entity_id=no_entity_id,
        max_seq_len=args.max_seq_length)

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)

    raw_data = predict_ds.data
    id2label = dict(enumerate(predict_ds.label_list))

    predict_ds = predict_ds.map(trans_func)
    predict_data_loader = DataLoader(
        dataset=predict_ds,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_size=args.batch_size,
        return_list=True)

    model = BertForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
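# `parse_decodes` is called above but not defined in this section. The sketch
# below is a hedged, illustrative reconstruction: it assumes each raw example is
# a dict with a 'tokens' field (as in the msra_ner examples), maps predicted
# label ids back to tag names, and trims the [CLS]/[SEP] positions using the
# recorded sequence length. The original helper's output format may differ.
def parse_decodes(raw_data, id2label, pred_list, len_list):
    pred_ids = [p for batch in pred_list for p in batch]
    lengths = [l for batch in len_list for l in batch]
    outputs = []
    for example, preds, length in zip(raw_data, pred_ids, lengths):
        tokens = example['tokens']
        # Drop [CLS], cut to the unpadded length, and drop [SEP].
        tags = [id2label[int(i)] for i in preds[1:int(length) - 1]]
        outputs.append(' '.join(
            '(%s, %s)' % (token, tag) for token, tag in zip(tokens, tags)))
    return outputs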
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import paddle
import paddle.fluid as fluid
import numpy as np
from zhusuan.distributions.base import *
import unittest

device = paddle.set_device('gpu')
paddle.disable_static(device)


class Dist(Distribution):
    def __init__(self,
                 dtype='float32',
                 param_dtype='float32',
                 group_ndims=0,
                 shape_fully_defined=True,
                 **kwargs):
        super(Dist, self).__init__(dtype,
                                   param_dtype,
                                   is_continuous=True,
                                   is_reparameterized=True,
                                   group_ndims=group_ndims,
                                   **kwargs)
        self._shape_fully_defined = shape_fully_defined
def main(): parser = PdArgumentParser( (ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() set_seed(training_args) paddle.set_device(training_args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() training_args.eval_iters = 10 training_args.test_iters = training_args.eval_iters * 10 # Log model and data config training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) # if last_checkpoint is None and len( # os.listdir(training_args.output_dir)) > 1: # raise ValueError( # f"Output directory ({training_args.output_dir}) already exists and is not empty. " # "Use --overwrite_output_dir to overcome.") if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[ model_args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) if model_args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ model_args.model_name_or_path] model_config["hidden_dropout_prob"] = model_args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = model_args.attention_probs_dropout_prob model = model_class(base_class(**model_config)) else: model = model_class.from_pretrained( model_args.model_name_or_path, hidden_dropout_prob=model_args.hidden_dropout_prob, attention_probs_dropout_prob=model_args. attention_probs_dropout_prob) class CriterionWrapper(paddle.nn.Layer): """ """ def __init__(self): """CriterionWrapper """ super(CriterionWrapper, self).__init__() self.criterion = criterion_class() def forward(self, output, labels): """forward function Args: output (tuple): prediction_scores, seq_relationship_score labels (tuple): masked_lm_labels, next_sentence_labels Returns: Tensor: final loss. 
""" prediction_scores, seq_relationship_score = output masked_lm_labels, next_sentence_labels = labels lm_loss, sop_loss = self.criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) loss = lm_loss + sop_loss return loss # Create the learning_rate sheduler and optimizer if training_args.decay_steps is None: training_args.decay_steps = training_args.max_steps warmup_steps = training_args.warmup_ratio * training_args.max_steps lr_scheduler = LinearAnnealingWithWarmupDecay( training_args.learning_rate, training_args.min_learning_rate, warmup_step=warmup_steps, decay_step=training_args.decay_steps) data_file = get_train_data_file(data_args) tokenizer = tokenizer_class.from_pretrained(model_args.model_name_or_path) train_dataset, eval_dataset, test_dataset, data_collator = create_pretrained_dataset( data_args, training_args, data_file, tokenizer) trainer = PretrainingTrainer( model=model, criterion=CriterionWrapper(), args=training_args, data_collator=data_collator, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, optimizers=(None, lr_scheduler), tokenizer=tokenizer, ) checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() if training_args.do_predict: test_ret = trainer.predict(test_dataset) trainer.log_metrics("test", test_ret.metrics)
def main(args): paddle.seed(12345) # load config config = load_yaml(args.config_yaml) config["yaml_path"] = args.config_yaml config["config_abs_dir"] = args.abs_dir # modify config from command if args.opt: for parameter in args.opt: parameter = parameter.strip() key, value = parameter.split("=") if type(config.get(key)) is int: value = int(value) if type(config.get(key)) is float: value = float(value) if type(config.get(key)) is bool: value = (True if value.lower() == "true" else False) config[key] = value # load static model class static_model_class = load_static_model_class(config) input_data = static_model_class.create_feeds() input_data_names = [data.name for data in input_data] fetch_vars = static_model_class.net(input_data) #infer_target_var = model.infer_target_var logger.info("cpu_num: {}".format(os.getenv("CPU_NUM"))) use_gpu = config.get("runner.use_gpu", True) use_xpu = config.get("runner.use_xpu", False) use_auc = config.get("runner.use_auc", False) use_visual = config.get("runner.use_visual", False) use_inference = config.get("runner.use_inference", False) auc_num = config.get("runner.auc_num", 1) train_data_dir = config.get("runner.train_data_dir", None) epochs = config.get("runner.epochs", None) print_interval = config.get("runner.print_interval", None) model_save_path = config.get("runner.model_save_path", "model_output") model_init_path = config.get("runner.model_init_path", None) batch_size = config.get("runner.train_batch_size", None) reader_type = config.get("runner.reader_type", "DataLoader") use_fleet = config.get("runner.use_fleet", False) use_save_data = config.get("runner.use_save_data", False) os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1)) logger.info("**************common.configs**********") logger.info( "use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}" .format(use_gpu, use_xpu, use_visual, batch_size, train_data_dir, epochs, print_interval, model_save_path)) logger.info("**************common.configs**********") if use_xpu: xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0)) place = paddle.set_device(xpu_device) else: place = paddle.set_device('gpu' if use_gpu else 'cpu') if use_fleet: from paddle.distributed import fleet strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) if use_fleet: static_model_class.create_optimizer(strategy) else: static_model_class.create_optimizer() exe = paddle.static.Executor(place) # initialize exe.run(paddle.static.default_startup_program()) if model_init_path is not None: load_static_parameter(paddle.static.default_main_program(), model_init_path, prefix='rec_static') last_epoch_id = config.get("last_epoch", -1) # Create a log_visual object and store the data in the path if use_visual: from visualdl import LogWriter log_visual = LogWriter(args.abs_dir + "/visualDL_log/train") else: log_visual = None step_num = 0 if reader_type == 'QueueDataset': dataset, file_list = get_reader(input_data, config) elif reader_type == 'DataLoader': train_dataloader = create_data_loader(config=config, place=place) elif reader_type == "CustomizeDataLoader": train_dataloader = static_model_class.create_data_loader() reader_type = 'DataLoader' for epoch_id in range(last_epoch_id + 1, epochs): epoch_begin = time.time() if use_auc: reset_auc(use_fleet, auc_num) if reader_type == 'DataLoader': fetch_batch_var, step_num = dataloader_train( epoch_id, train_dataloader, input_data_names, fetch_vars, exe, 
config, use_visual, log_visual, step_num) metric_str = "" for var_idx, var_name in enumerate(fetch_vars): metric_str += "{}: {}, ".format( var_name, str(fetch_batch_var[var_idx]).strip("[]")) logger.info("epoch: {} done, ".format(epoch_id) + metric_str + "epoch time: {:.2f} s".format(time.time() - epoch_begin)) elif reader_type == 'QueueDataset': fetch_batch_var = dataset_train(epoch_id, dataset, fetch_vars, exe, config) logger.info("epoch: {} done, ".format(epoch_id) + "epoch time: {:.2f} s".format(time.time() - epoch_begin)) else: logger.info("reader type wrong") if use_fleet: trainer_id = paddle.distributed.get_rank() if trainer_id == 0: save_static_model(paddle.static.default_main_program(), model_save_path, epoch_id, prefix='rec_static') else: save_static_model(paddle.static.default_main_program(), model_save_path, epoch_id, prefix='rec_static') if use_save_data: save_data(fetch_batch_var, model_save_path) if use_inference: feed_var_names = config.get("runner.save_inference_feed_varnames", []) feedvars = [] fetch_var_names = config.get( "runner.save_inference_fetch_varnames", []) fetchvars = [] for var_name in feed_var_names: if var_name not in paddle.static.default_main_program( ).global_block().vars: raise ValueError( "Feed variable: {} not in default_main_program, global block has follow vars: {}" .format( var_name, paddle.static.default_main_program().global_block( ).vars.keys())) else: feedvars.append(paddle.static.default_main_program(). global_block().vars[var_name]) for var_name in fetch_var_names: if var_name not in paddle.static.default_main_program( ).global_block().vars: raise ValueError( "Fetch variable: {} not in default_main_program, global block has follow vars: {}" .format( var_name, paddle.static.default_main_program().global_block( ).vars.keys())) else: fetchvars.append(paddle.static.default_main_program(). global_block().vars[var_name]) save_inference_model(model_save_path, epoch_id, feedvars, fetchvars, exe)
def label_box(anchors,
              gt_boxes,
              positive_overlap,
              negative_overlap,
              allow_low_quality,
              ignore_thresh,
              is_crowd=None,
              assign_on_cpu=False):
    if assign_on_cpu:
        paddle.set_device("cpu")
        iou = bbox_overlaps(gt_boxes, anchors)
        paddle.set_device("gpu")
    else:
        iou = bbox_overlaps(gt_boxes, anchors)

    n_gt = gt_boxes.shape[0]
    if n_gt == 0 or is_crowd is None:
        n_gt_crowd = 0
    else:
        n_gt_crowd = paddle.nonzero(is_crowd).shape[0]
    if iou.shape[0] == 0 or n_gt_crowd == n_gt:
        # No truth, assign everything to background
        default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64')
        default_match_labels = paddle.full((iou.shape[1], ), 0, dtype='int32')
        return default_matches, default_match_labels

    # if ignore_thresh > 0, remove the anchor if it is close to
    # one of the crowded ground-truth boxes
    if n_gt_crowd > 0:
        N_a = anchors.shape[0]
        ones = paddle.ones([N_a])
        mask = is_crowd * ones

        if ignore_thresh > 0:
            crowd_iou = iou * mask
            valid = (paddle.sum((crowd_iou > ignore_thresh).cast('int32'),
                                axis=0) > 0).cast('float32')
            iou = iou * (1 - valid) - valid

        # ignore the iou between anchor and crowded ground-truth
        iou = iou * (1 - mask) - mask

    matched_vals, matches = paddle.topk(iou, k=1, axis=0)
    match_labels = paddle.full(matches.shape, -1, dtype='int32')
    # set ignored anchors with iou = -1
    neg_cond = paddle.logical_and(matched_vals > -1,
                                  matched_vals < negative_overlap)
    match_labels = paddle.where(neg_cond, paddle.zeros_like(match_labels),
                                match_labels)
    match_labels = paddle.where(matched_vals >= positive_overlap,
                                paddle.ones_like(match_labels), match_labels)
    if allow_low_quality:
        highest_quality_foreach_gt = iou.max(axis=1, keepdim=True)
        pred_inds_with_highest_quality = paddle.logical_and(
            iou > 0, iou == highest_quality_foreach_gt).cast('int32').sum(
                0, keepdim=True)
        match_labels = paddle.where(pred_inds_with_highest_quality > 0,
                                    paddle.ones_like(match_labels),
                                    match_labels)

    matches = matches.flatten()
    match_labels = match_labels.flatten()

    return matches, match_labels
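# A small illustrative sketch (not from the original file) of the core matching
# step above: for a toy IoU matrix of shape [n_gt, n_anchor], each anchor takes
# its best-overlapping ground truth via topk over axis 0 and is then labeled
# 1 (positive), 0 (negative) or -1 (ignored) by the thresholds. The numbers are
# made up purely for illustration.
import paddle

iou = paddle.to_tensor([[0.8, 0.2, 0.05],
                        [0.1, 0.6, 0.10]])          # 2 gt boxes, 3 anchors
positive_overlap, negative_overlap = 0.5, 0.3

matched_vals, matches = paddle.topk(iou, k=1, axis=0)  # best gt per anchor
match_labels = paddle.full(matches.shape, -1, dtype='int32')
neg_cond = paddle.logical_and(matched_vals > -1,
                              matched_vals < negative_overlap)
match_labels = paddle.where(neg_cond, paddle.zeros_like(match_labels),
                            match_labels)
match_labels = paddle.where(matched_vals >= positive_overlap,
                            paddle.ones_like(match_labels), match_labels)
print(matches.flatten().numpy())       # -> [0 1 1]
print(match_labels.flatten().numpy())  # -> [1 1 0]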
def do_train(args): # Initialize the paddle and paddle fleet execute enviroment paddle.enable_static() place = paddle.set_device(args.device) fleet.init(is_collective=True) worker_num = fleet.worker_num() worker_index = fleet.worker_index() # Create the random seed for the worker set_seed(args.seed) worker_init = WorkerInitObj(args.seed + worker_index) # Define the input data in the static mode main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() data_holders = create_data_holder(args) [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels, masked_lm_scale ] = data_holders # Define the model structure in static mode args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) config = model_class.pretrained_init_configuration[args.model_name_or_path] if config["vocab_size"] % 8 != 0: config["vocab_size"] += 8 - (config["vocab_size"] % 8) model = BertForPretraining(BertModel(**config)) criterion = BertPretrainingCriterion(model.bert.config["vocab_size"]) prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learing_reate scheduler and optimizer num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, multi_precision=args.use_pure_fp16) # Use the fleet api to compile the distributed optimizer optimizer = dist_optimizer(args, optimizer) optimizer.minimize(loss) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) state_dict = model.state_dict() # Use the state dict to update the parameter reset_state_dict = reset_program_state_dict(model, state_dict) paddle.static.set_program_state(main_program, reset_state_dict) if args.use_amp: optimizer.amp_init(place) pool = ThreadPoolExecutor(1) global_step = 0 tic_train = time.time() epoch = 0 while True: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 # Select one file for each worker and create the DataLoader for the file data_file = select_dataset_file_for_each_worker( files, f_start_id, worker_num, worker_index) train_data_loader, _ = create_pretraining_dataset( data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) for f_id in range(f_start_id + 1, len(files)): data_file = select_dataset_file_for_each_worker( files, f_id, worker_num, worker_index) dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() for step, batch in enumerate(train_data_loader): train_reader_cost += time.time() - reader_start global_step += 1 train_start = time.time() loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) train_run_cost += time.time() - train_start total_samples += args.batch_size # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_steps == 0: print( "tobal step: %d, epoch: %d, batch: %d, loss: %f, " "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, epoch, step, loss_return[0], train_reader_cost / args.logging_steps, (train_reader_cost + train_run_cost) / args.logging_steps, total_samples / args.logging_steps, total_samples / (train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 if global_step % args.save_steps == 0: if worker_index == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model.save_model_config(output_dir) paddle.static.save( main_program, os.path.join(output_dir, "model_state")) tokenizer.save_pretrained(output_dir) if global_step >= args.max_steps: reader_start = time.time() del train_data_loader return reader_start = time.time() del train_data_loader train_data_loader, data_file = dataset_future.result(timeout=None) epoch += 1
def main(): parser = PdArgumentParser( (ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") paddle.set_device(training_args.device) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) data_args.dataset = data_args.dataset.strip() if data_args.dataset in ALL_DATASETS: # if you custom you hyper-parameters in yaml config, it will overwrite all args. config = ALL_DATASETS[data_args.dataset] logger.info("Over-writing training config by yaml config!") for args in (model_args, data_args, training_args): for arg in vars(args): if arg in config.keys(): setattr(args, arg, config[arg]) training_args.per_device_train_batch_size = config["batch_size"] training_args.per_device_eval_batch_size = config["batch_size"] dataset_config = data_args.dataset.split(" ") raw_datasets = load_dataset( dataset_config[0], None if len(dataset_config) <= 1 else dataset_config[1], ) data_args.label_list = getattr(raw_datasets['train'], "label_list", None) num_classes = 1 if raw_datasets["train"].label_list == None else len( raw_datasets['train'].label_list) # Define tokenizer, model, loss function. tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, num_classes=num_classes) criterion = nn.loss.CrossEntropyLoss( ) if data_args.label_list else nn.loss.MSELoss() # Define dataset pre-process function if "clue" in data_args.dataset: trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args) else: trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args) # Define data collector data_collator = DataCollatorWithPadding(tokenizer) # Dataset pre-process if training_args.do_train: train_dataset = raw_datasets["train"].map(trans_fn) if training_args.do_eval: eval_dataset = raw_datasets["dev"].map(trans_fn) if training_args.do_predict: test_dataset = raw_datasets["test"].map(trans_fn) # Define the metrics of tasks. 
def compute_metrics(p): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = paddle.to_tensor(preds) label = paddle.to_tensor(p.label_ids) probs = F.softmax(preds, axis=1) metric = Accuracy() metric.reset() result = metric.compute(preds, label) metric.update(result) accu = metric.accumulate() metric.reset() return {"accuracy": accu} trainer = Trainer( model=model, criterion=criterion, args=training_args, data_collator=data_collator, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, compute_metrics=compute_metrics, ) checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluate and tests model if training_args.do_eval: eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) if training_args.do_predict: test_ret = trainer.predict(test_dataset) trainer.log_metrics("test", test_ret.metrics) if test_ret.label_ids is None: paddle.save( test_ret.predictions, os.path.join(training_args.output_dir, "test_results.pdtensor"), ) # export inference model if training_args.do_export: # You can also load from certain checkpoint # trainer.load_state_dict_from_checkpoint("/path/to/checkpoint/") input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids paddle.static.InputSpec(shape=[None, None], dtype="int64") # segment_ids ] if model_args.export_model_dir is None: model_args.export_model_dir = os.path.join( training_args.output_dir, "export") paddlenlp.transformers.export_model(model=trainer.model, input_spec=input_spec, path=model_args.export_model_dir)
def do_train(args): # Initialize the paddle execute enviroment paddle.enable_static() place = paddle.set_device(args.select_device) # Set the random seed set_seed(args.seed) # Define the input data in the static mode main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() data_holders = create_data_holder(args) [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels, masked_lm_scale ] = data_holders # Define the model structure in static mode args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) config = model_class.pretrained_init_configuration[args.model_name_or_path] if config["vocab_size"] % 8 != 0: config["vocab_size"] += 8 - (config["vocab_size"] % 8) model = BertForPretraining(BertModel(**config)) criterion = BertPretrainingCriterion(model.bert.config["vocab_size"]) prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learing_reate scheduler and optimizer num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ], multi_precision=False) if args.use_amp: custom_black_list = (['lookup_table', 'lookup_table_v2'] if args.use_pure_fp16 else None) amp_list = paddle.static.amp.AutoMixedPrecisionLists( custom_white_list=['layer_norm', 'softmax', 'gelu'], custom_black_list=custom_black_list) optimizer = paddle.static.amp.decorate( optimizer, amp_list, init_loss_scaling=args.scale_loss, use_dynamic_loss_scaling=True, use_pure_fp16=args.use_pure_fp16) optimizer.minimize(loss) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) state_dict = model.state_dict() # Use the state dict to update the parameter reset_state_dict = reset_program_state_dict(model, state_dict) paddle.static.set_program_state(main_program, reset_state_dict) if args.use_amp: optimizer.amp_init(place) # Construct the compiled program main_program = build_compiled_program(args, main_program, loss) global_step = 0 tic_train = time.time() epoch = 0 while True: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in f ] files.sort() random.Random(args.seed + epoch).shuffle(files) for f_id in range(0, len(files)): train_data_loader, _ = create_pretraining_dataset( files[f_id], args.max_predictions_per_seq, args, data_holders) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() for step, batch in enumerate(train_data_loader): train_reader_cost += time.time() - reader_start global_step += 1 train_start = time.time() loss_return = exe.run(main_program,\ feed=batch, fetch_list=[loss]) train_run_cost += time.time() - train_start total_samples += args.batch_size # In 
the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_steps == 0: print( "global step: %d, epoch: %d, batch: %d, loss: %f, " "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, epoch, step, loss_return[0], train_reader_cost / args.logging_steps, (train_reader_cost + train_run_cost) / args.logging_steps, total_samples / args.logging_steps, total_samples / (train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 if global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # TODO(fangzeyang): Udpate the save_params to paddle.static paddle.fluid.io.save_params(exe, output_dir) tokenizer.save_pretrained(output_dir) if global_step >= args.max_steps: reader_start = time.time() del train_data_loader return reader_start = time.time() del train_data_loader epoch += 1
def do_train(args): paddle.enable_static() if not args.eager_run else None paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank()) args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] # Loads or initializes a model. pretrained_models = list(tokenizer_class.pretrained_init_configuration.keys( )) def get_opt_config(model_cls, name): config = model_cls.pretrained_init_configuration[name] # Optimize for AMP. if "vocab_size" in config: if config["vocab_size"] % 8 != 0: config["vocab_size"] += 8 - (config["vocab_size"] % 8) return config if args.model_name_or_path in pretrained_models: tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) generator = ElectraGenerator( ElectraModel(**get_opt_config(model_class, args.model_name_or_path + "-generator"))) discriminator = ElectraDiscriminator( ElectraModel(**get_opt_config(model_class, args.model_name_or_path + "-discriminator"))) model = model_class(generator, discriminator) args.init_from_ckpt = False else: if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt: # Load checkpoint tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) with open( os.path.join(args.model_name_or_path, "run_states.json"), 'r') as f: config_dict = json.load(f) model_name = config_dict["model_name"] if model_name in pretrained_models: generator = ElectraGenerator( ElectraModel(**get_opt_config(model_class, model_name + "-generator"))) discriminator = ElectraDiscriminator( ElectraModel(**get_opt_config(model_class, model_name + "-discriminator"))) model = model_class(generator, discriminator) model.set_state_dict( paddle.load( os.path.join(args.model_name_or_path, "model_state.pdparams"))) else: raise ValueError( "initialize a model from ckpt need model_name " "in model_config_file. The supported model_name " "are as follows: {}".format( tokenizer_class.pretrained_init_configuration.keys())) else: raise ValueError( "initialize a model need identifier or the " "directory of storing model. if use identifier, the supported model " "identifiers are as follows: {}, if use directory, " "make sure set init_from_ckpt as True".format( model_class.pretrained_init_configuration.keys())) criterion = ElectraPretrainingCriterion( getattr(model.generator, ElectraGenerator.base_model_prefix).config["vocab_size"], model.gen_weight, model.disc_weight) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) # Loads dataset. tic_load_data = time.time() print("start load data : %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) train_dataset = BookCorpus( data_path=args.input_dir, tokenizer=tokenizer, max_seq_length=args.max_seq_length, mode='train') print("load data done, total : %s s" % (time.time() - tic_load_data)) # Reads data and generates mini-batches. 
data_collator = DataCollatorForElectra( tokenizer=tokenizer, max_seq_length=args.max_seq_length, mlm=True, mlm_probability=args.mask_prob) train_data_loader = create_dataloader( train_dataset, batch_size=args.train_batch_size, mode='train', use_gpu=True if args.device in "gpu" else False, data_collator=data_collator) num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_train_epochs) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=clip, apply_decay_param_fun=lambda x: x in decay_params) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) print("start train : %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) trained_global_step = global_step = 0 t_loss = paddle.to_tensor([0.0]) log_loss = paddle.to_tensor([0.0]) loss_list = [] log_list = [] tic_train = time.time() if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt: optimizer.set_state_dict( paddle.load( os.path.join(args.model_name_or_path, "model_state.pdopt"))) trained_global_step = global_step = config_dict["global_step"] if trained_global_step < num_training_steps: print( "[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s" % (trained_global_step, trained_global_step + 1)) else: print( "[ start train from checkpoint ] we have already trained %s steps, but total training steps is %s, please check configuration !" 
% (trained_global_step, num_training_steps)) exit(0) for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): if trained_global_step > 0: trained_global_step -= 1 continue global_step += 1 input_ids, raw_input_ids, gen_labels = batch if args.use_amp: with paddle.amp.auto_cast(): gen_logits, disc_logits, disc_labels, attention_mask = model( input_ids=input_ids, raw_input_ids=raw_input_ids, gen_labels=gen_labels) loss = criterion(gen_logits, disc_logits, gen_labels, disc_labels, attention_mask) scaled = scaler.scale(loss) scaled.backward() t_loss += loss.detach() scaler.minimize(optimizer, scaled) else: gen_logits, disc_logits, disc_labels, attention_mask = model( input_ids=input_ids, raw_input_ids=raw_input_ids, gen_labels=gen_labels) loss = criterion(gen_logits, disc_logits, gen_labels, disc_labels, attention_mask) loss.backward() t_loss += loss.detach() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: local_loss = (t_loss - log_loss) / args.logging_steps if (paddle.distributed.get_world_size() > 1): paddle.distributed.all_gather(loss_list, local_loss) if paddle.distributed.get_rank() == 0: log_str = ( "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, " "avg_loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it" ).format(global_step, num_training_steps, epoch, step, float((paddle.stack(loss_list).sum() / len( loss_list)).numpy()), optimizer.get_lr(), (time.time() - tic_train) / args.logging_steps) print(log_str) log_list.append(log_str) loss_list = [] else: log_str = ( "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, " "loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it" ).format(global_step, num_training_steps, epoch, step, float(local_loss.numpy()), optimizer.get_lr(), (time.time() - tic_train) / args.logging_steps) print(log_str) log_list.append(log_str) log_loss = t_loss tic_train = time.time() if global_step % args.save_steps == 0: if paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "model_%d.pdparams" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model config_to_save = copy.deepcopy( model_to_save.discriminator.electra.config) if 'self' in config_to_save: del config_to_save['self'] run_states = { "model_name": model_name if args.init_from_ckpt else args.model_name_or_path, "global_step": global_step, "epoch": epoch, "step": step, } with open( os.path.join(output_dir, "model_config.json"), 'w') as f: json.dump(config_to_save, f) with open( os.path.join(output_dir, "run_states.json"), 'w') as f: json.dump(run_states, f) paddle.save(model.state_dict(), os.path.join(output_dir, "model_state.pdparams")) tokenizer.save_pretrained(output_dir) paddle.save(optimizer.state_dict(), os.path.join(output_dir, "model_state.pdopt")) if len(log_list) > 0: with open(os.path.join(output_dir, "train.log"), 'w') as f: for log in log_list: if len(log.strip()) > 0: f.write(log.strip() + '\n') if global_step >= num_training_steps: return
            batch)
        query_ids = paddle.to_tensor(query_ids)
        title_ids = paddle.to_tensor(title_ids)
        query_seq_lens = paddle.to_tensor(query_seq_lens)
        title_seq_lens = paddle.to_tensor(title_seq_lens)
        logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)

    return results


if __name__ == "__main__":
    paddle.set_device("gpu") if args.use_gpu else paddle.set_device("cpu")

    # Loads vocab.
    vocab = Vocab.load_vocabulary(args.vocab_path,
                                  unk_token='[UNK]',
                                  pad_token='[PAD]')
    tokenizer = JiebaTokenizer(vocab)
    label_map = {0: 'dissimilar', 1: 'similar'}

    # Constructs the network.
    model = ppnlp.models.SimNet(network=args.network,
                                vocab_size=len(vocab),
                                num_classes=len(label_map))

    # Loads model parameters.
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
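# A hedged usage sketch (not part of the original script) showing how a raw
# query/title pair could be fed to the SimNet model loaded above; `query` and
# `title` are hypothetical examples, and the exact preprocessing in the original
# predict.py may differ. It reuses vocab, tokenizer, label_map and model from above.
query, title = '世界上什么东西最小', '世界上最小的东西是什么'
query_ids = vocab.to_indices(tokenizer.cut(query))
title_ids = vocab.to_indices(tokenizer.cut(title))
model.eval()
logits = model(paddle.to_tensor([query_ids]),
               paddle.to_tensor([title_ids]),
               paddle.to_tensor([len(query_ids)]),
               paddle.to_tensor([len(title_ids)]))
probs = F.softmax(logits, axis=1)
print(label_map[int(paddle.argmax(probs, axis=1).numpy()[0])])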
def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False):
    """
    Predicts the data labels.

    Args:
        data (obj:`List(str)`): The processed data, each element of which is a raw text string.
        max_seq_len (obj:`int`, `optional`, defaults to 128): If set to a number, will limit the total
            sequence returned so that it has a maximum length.
        batch_size (obj:`int`, defaults to 1): The batch size.
        use_gpu (obj:`bool`, defaults to `False`): Whether to run on GPU.

    Returns:
        results (obj:`list`): All the predicted labels.
    """
    # TODO(zhangxuefei): add token_classification task predict.
    if self.task not in ['sequence_classification']:
        raise RuntimeError(
            "The predict method is for sequence_classification task, but got task %s."
            % self.task)

    paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
    tokenizer = self.get_tokenizer()

    examples = []
    for text in data:
        if len(text) == 1:
            encoded_inputs = tokenizer.encode(
                text[0], text_pair=None, max_seq_len=max_seq_len)
        elif len(text) == 2:
            encoded_inputs = tokenizer.encode(
                text[0], text_pair=text[1], max_seq_len=max_seq_len)
        else:
            raise RuntimeError(
                'The input text must have one or two sequences, but got %d. Please check your inputs.'
                % len(text))
        examples.append(
            (encoded_inputs['input_ids'], encoded_inputs['segment_ids']))

    def _batchify_fn(batch):
        input_ids = [entry[0] for entry in batch]
        segment_ids = [entry[1] for entry in batch]
        return input_ids, segment_ids

    # Separates data into batches.
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        # The last batch whose size is less than the configured batch_size.
        batches.append(one_batch)

    results = []
    self.eval()
    for batch in batches:
        input_ids, segment_ids = _batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)

        # TODO(zhangxuefei): add token_classification postprocess after prediction.
        if self.task == 'sequence_classification':
            probs = self(input_ids, segment_ids)
            idx = paddle.argmax(probs, axis=1).numpy()
            idx = idx.tolist()
            labels = [self.label_map[i] for i in idx]
            results.extend(labels)

    return results
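# Hedged usage sketch for the predict() method above. `module` stands for an
# already-instantiated sequence-classification model that exposes this method
# and a label_map; the example inputs are hypothetical.
#
#     data = [['这个宾馆比较陈旧了'], ['前台接待太差', '房间倒是很干净']]
#     labels = module.predict(data, max_seq_len=128, batch_size=2, use_gpu=False)
#     print(labels)  # e.g. ['negative', 'positive'], depending on module.label_map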
def train(args): paddle.set_device(args.device) trainer_num = paddle.distributed.get_world_size() if trainer_num > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() # Create dataset. train_ds, test_ds = load_dataset( datafiles=(os.path.join(args.data_dir, 'train.tsv'), os.path.join(args.data_dir, 'test.tsv'))) word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic')) label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic')) # q2b.dic is used to replace DBC case to SBC case normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic')) trans_func = partial(convert_example, max_seq_len=args.max_seq_len, word_vocab=word_vocab, label_vocab=label_vocab, normlize_vocab=normlize_vocab) train_ds.map(trans_func) test_ds.map(trans_func) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=word_vocab.get("[PAD]", 0), dtype='int64' ), # word_ids Stack(dtype='int64'), # length Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64' ), # label_ids ): fn(samples) # Create sampler for dataloader train_sampler = paddle.io.DistributedBatchSampler( dataset=train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True) train_loader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=train_sampler, return_list=True, collate_fn=batchify_fn) test_sampler = paddle.io.BatchSampler(dataset=test_ds, batch_size=args.batch_size, shuffle=False, drop_last=False) test_loader = paddle.io.DataLoader(dataset=test_ds, batch_sampler=test_sampler, return_list=True, collate_fn=batchify_fn) # Define the model netword and its loss model = BiGruCrf(args.emb_dim, args.hidden_size, len(word_vocab), len(label_vocab), crf_lr=args.crf_lr) # Prepare optimizer, loss and metric evaluator optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr, parameters=model.parameters()) chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True) if args.init_checkpoint: if os.path.exists(args.init_checkpoint): logger.info("Init checkpoint from %s" % args.init_checkpoint) model_dict = paddle.load(args.init_checkpoint) model.load_dict(model_dict) else: logger.info("Cannot init checkpoint from %s which doesn't exist" % args.init_checkpoint) logger.info("Start training") # Start training global_step = 0 last_step = args.epochs * len(train_loader) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() max_f1_score = -1 for epoch in range(args.epochs): for step, batch in enumerate(train_loader): train_reader_cost += time.time() - reader_start global_step += 1 token_ids, length, label_ids = batch train_start = time.time() loss = model(token_ids, length, label_ids) avg_loss = paddle.mean(loss) train_run_cost += time.time() - train_start total_samples += args.batch_size if global_step % args.logging_steps == 0: logger.info( "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, last_step, avg_loss, train_reader_cost / args.logging_steps, (train_reader_cost + train_run_cost) / args.logging_steps, total_samples / args.logging_steps, total_samples / (train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 avg_loss.backward() optimizer.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == last_step: if rank == 0: paddle.save( model.state_dict(), os.path.join(args.model_save_dir, "model_%d.pdparams" % global_step)) logger.info("Save %d steps model." 
% (global_step)) if args.do_eval: precision, recall, f1_score = evaluate( model, chunk_evaluator, test_loader) if f1_score > max_f1_score: max_f1_score = f1_score paddle.save( model.state_dict(), os.path.join(args.model_save_dir, "best_model.pdparams")) logger.info("Save best model.") reader_start = time.time()
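# The training loop above calls an `evaluate` helper that is not shown in this
# excerpt. A minimal sketch of what such a helper typically looks like, assuming
# the BiGruCrf model returns decoded tag ids when called without labels:
@paddle.no_grad()
def evaluate(model, metric, data_loader):
    model.eval()
    metric.reset()
    for token_ids, length, label_ids in data_loader:
        preds = model(token_ids, length)
        n_infer, n_label, n_correct = metric.compute(length, preds, label_ids)
        metric.update(n_infer.numpy(), n_label.numpy(), n_correct.numpy())
    precision, recall, f1_score = metric.accumulate()
    model.train()
    return precision, recall, f1_score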
def do_train(): set_seed(args.seed) paddle.set_device("gpu" if args.n_gpu else "cpu") world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() train_ds, dev_ds, test_ds = load_dataset("chnsenticorp", splits=["train", "dev", "test"]) # If you wanna use bert/roberta/electra pretrained model, # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class=2) # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_class=2) # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2) model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( 'ernie-tiny', num_classes=len(train_ds.label_list)) # If you wanna use bert/roberta/electra pretrained model, # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small', num_classes=2) # ErnieTinyTokenizer is special for ernie-tiny pretained model. tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( 'ernie-tiny') trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype="int64") # label ): [data for data in fn(samples)] train_data_loader = create_dataloader(train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) dev_data_loader = create_dataloader(dev_ds, mode='dev', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) test_data_loader = create_dataloader(test_ds, mode='test', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): input_ids, token_type_ids, labels = batch logits = model(input_ids, token_type_ids) loss = criterion(logits, labels) probs = F.softmax(logits, axis=1) correct = metric.compute(probs, labels) metric.update(correct) acc = metric.accumulate() global_step += 1 if global_step % 10 == 0 and paddle.distributed.get_rank() == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, acc, 10 / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % 100 == 0 and paddle.distributed.get_rank() == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) evaluate(model, criterion, metric, dev_data_loader) model._layers.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) if paddle.distributed.get_rank() == 0: print('Evaluating on test data.') evaluate(model, criterion, metric, test_data_loader)
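# `convert_example` is referenced above but not defined in this excerpt. A common
# implementation for single-sentence classification on chnsenticorp looks roughly
# like the sketch below; treat the field names ("text", "label") and the is_test
# switch as assumptions rather than the exact original helper.
import numpy as np


def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    encoded_inputs = tokenizer(text=example["text"], max_seq_len=max_seq_length)
    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]
    if is_test:
        return input_ids, token_type_ids
    label = np.array([example["label"]], dtype="int64")
    return input_ids, token_type_ids, label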
def main(): args = parse_args() # device setup paddle.set_device("gpu") # load model model = resnet50(pretrained=True) model.eval() # set model's intermediate outputs outputs = [] def hook(module, input, output): outputs.append(output) model.layer1[-1].register_forward_post_hook(hook) model.layer2[-1].register_forward_post_hook(hook) model.layer3[-1].register_forward_post_hook(hook) model.avgpool.register_forward_post_hook(hook) os.makedirs(os.path.join(args.save_path, 'temp'), exist_ok=True) fig, ax = plt.subplots(1, 2, figsize=(20, 10)) fig_img_rocauc = ax[0] fig_pixel_rocauc = ax[1] total_roc_auc = [] total_pixel_roc_auc = [] for class_name in mvtec.CLASS_NAMES: train_dataset = mvtec.MVTecDataset(class_name=class_name, is_train=True) train_dataloader = DataLoader(train_dataset, batch_size=32) test_dataset = mvtec.MVTecDataset(class_name=class_name, is_train=False) test_dataloader = DataLoader(test_dataset, batch_size=1) train_outputs = OrderedDict([('layer1', []), ('layer2', []), ('layer3', []), ('avgpool', [])]) test_outputs = OrderedDict([('layer1', []), ('layer2', []), ('layer3', []), ('avgpool', [])]) # extract train set features train_feature_filepath = os.path.join(args.save_path, 'temp', 'train_%s.pkl' % class_name) if not os.path.exists(train_feature_filepath): # extract train set features for (x, y, mask) in tqdm( train_dataloader, '| feature extraction | train | %s |' % class_name): with paddle.no_grad(): pred = model(x) for k, v in zip(train_outputs.keys(), outputs): train_outputs[k].append(v) # initialize hook outputs outputs = [] # transfer to np.array in order to save by pickle for k, v in train_outputs.items(): train_outputs[k] = paddle.concat(v, 0).numpy() with open(train_feature_filepath, 'wb') as f: pickle.dump(train_outputs, f) # transfer back to paddle.Tensor() in order to continue to compute in test stage for k, v in train_outputs.items(): train_outputs[k] = paddle.to_tensor(v) else: print('load train set feature from: %s' % train_feature_filepath) with open(train_feature_filepath, 'rb') as f: train_outputs = pickle.load(f) for k, v in train_outputs.items(): train_outputs[k] = paddle.to_tensor(v) gt_list = [] gt_mask_list = [] test_imgs = [] # extract test set features for (x, y, mask) in tqdm(test_dataloader, '| feature extraction | test | %s |' % class_name): test_imgs.extend(x.cpu().detach().numpy()) gt_list.extend(y.cpu().detach().numpy()) gt_mask_list.extend(mask.cpu().detach().numpy()) # model prediction with paddle.no_grad(): pred = model(x) for k, v in zip(test_outputs.keys(), outputs): test_outputs[k].append(v) # initialize hook outputs outputs = [] for k, v in test_outputs.items(): test_outputs[k] = paddle.concat(v, 0) # calculate distance matrix dist_matrix = calc_dist_matrix( paddle.flatten(test_outputs['avgpool'], 1), paddle.flatten(train_outputs['avgpool'], 1)) # select K nearest neighbor and take average topk_values, topk_indexes = paddle.topk(dist_matrix, k=args.top_k, largest=False) scores = paddle.mean(topk_values, 1).cpu().detach().numpy() # calculate image-level ROC AUC score fpr, tpr, _ = roc_curve(gt_list, scores) roc_auc = roc_auc_score(gt_list, scores) total_roc_auc.append(roc_auc) print('%s ROCAUC: %.3f' % (class_name, roc_auc)) fig_img_rocauc.plot(fpr, tpr, label='%s ROCAUC: %.3f' % (class_name, roc_auc)) score_map_list = [] for t_idx in tqdm(range(test_outputs['avgpool'].shape[0]), '| localization | test | %s |' % class_name): score_maps = [] for layer_name in ['layer1', 'layer2', 'layer3']: # for each layer # construct a gallery of 
features at all pixel locations of the K nearest neighbors topk_feat_map = paddle.stack([ train_outputs[layer_name][idx] for idx in topk_indexes[t_idx].numpy().tolist() ]) test_feat_map = test_outputs[layer_name][t_idx:t_idx + 1] feat_gallery = paddle.transpose(topk_feat_map, [0, 3, 2, 1]) feat_gallery = paddle.flatten( feat_gallery, start_axis=0, stop_axis=2).unsqueeze(-1).unsqueeze(-1) # calculate distance matrix dist_matrix_list = [] for d_idx in range(feat_gallery.shape[0] // 100): dist = paddle.nn.PairwiseDistance() dist_matrix = dist( feat_gallery[d_idx * 100:d_idx * 100 + 100], test_feat_map) dist_matrix_list.append(dist_matrix) dist_matrix = paddle.concat(dist_matrix_list, 0) # k nearest features from the gallery (k=1) score_map = paddle.min(dist_matrix, axis=0) score_map = F.interpolate(score_map.unsqueeze(0).unsqueeze(0), size=[224, 224], mode='bilinear', align_corners=False) score_maps.append(score_map) # average distance between the features score_map = paddle.mean(paddle.concat(score_maps, 0), axis=0) # apply gaussian smoothing on the score map score_map = gaussian_filter( score_map.squeeze().cpu().detach().numpy(), sigma=4) score_map_list.append(score_map) flatten_gt_mask_list = np.concatenate(gt_mask_list).ravel().astype(int) flatten_score_map_list = np.concatenate(score_map_list).ravel() # calculate per-pixel level ROCAUC fpr, tpr, _ = roc_curve(flatten_gt_mask_list, flatten_score_map_list) per_pixel_rocauc = roc_auc_score(flatten_gt_mask_list, flatten_score_map_list) total_pixel_roc_auc.append(per_pixel_rocauc) print('%s pixel ROCAUC: %.3f' % (class_name, per_pixel_rocauc)) fig_pixel_rocauc.plot(fpr, tpr, label='%s ROCAUC: %.3f' % (class_name, per_pixel_rocauc)) # get optimal threshold precision, recall, thresholds = precision_recall_curve( flatten_gt_mask_list, flatten_score_map_list) a = 2 * precision * recall b = precision + recall f1 = np.divide(a, b, out=np.zeros_like(a), where=b != 0) threshold = thresholds[np.nanargmax(f1)] # visualize localization result visualize_loc_result(test_imgs, gt_mask_list, score_map_list, threshold, args.save_path, class_name, vis_num=5) print('Average ROCAUC: %.3f' % np.nanmean(total_roc_auc)) fig_img_rocauc.title.set_text('Average image ROCAUC: %.3f' % np.nanmean(total_roc_auc)) fig_img_rocauc.legend(loc="lower right") print('Average pixel ROCUAC: %.3f' % np.nanmean(total_pixel_roc_auc)) fig_pixel_rocauc.title.set_text('Average pixel ROCAUC: %.3f' % np.nanmean(total_pixel_roc_auc)) fig_pixel_rocauc.legend(loc="lower right") fig.tight_layout() fig.savefig(os.path.join(args.save_path, 'roc_curve.png'), dpi=100)
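# `calc_dist_matrix` is used above but not defined in this excerpt. A minimal
# sketch of a pairwise Euclidean distance matrix in Paddle (an assumption about
# what the helper does, not the original implementation):
import paddle


def calc_dist_matrix(x, y):
    """x: [n, d], y: [m, d] -> [n, m] matrix of Euclidean distances."""
    n, m, d = x.shape[0], y.shape[0], x.shape[1]
    x = x.unsqueeze(1).expand([n, m, d])
    y = y.unsqueeze(0).expand([n, m, d])
    return paddle.sqrt(paddle.sum((x - y) ** 2, axis=2))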
def run(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) set_seed(args) if paddle.distributed.get_rank() == 0: if os.path.exists(args.model_name_or_path): print("init checkpoint from %s" % args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. contexts = [examples[i]['context'] for i in range(len(examples))] questions = [examples[i]['question'] for i in range(len(examples))] tokenized_examples = tokenizer( questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) for i, tokenized_example in enumerate(tokenized_examples): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_example["input_ids"] cls_index = input_ids.index(tokenizer.cls_token_id) # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offsets = tokenized_example['offset_mapping'] # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_example['token_type_ids'] # One example can give several spans, this is the index of the example containing this span of text. sample_index = tokenized_example['overflow_to_sample'] answers = examples[sample_index]['answers'] answer_starts = examples[sample_index]['answer_starts'] # If no answers are given, set the cls_index as answer. if len(answer_starts) == 0: tokenized_examples[i]["start_positions"] = cls_index tokenized_examples[i]["end_positions"] = cls_index tokenized_examples[i]['answerable_label'] = 0 else: # Start/end character index of the answer in the text. start_char = answer_starts[0] end_char = start_char + len(answers[0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != 1: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 2 while sequence_ids[token_end_index] != 1: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples[i]["start_positions"] = cls_index tokenized_examples[i]["end_positions"] = cls_index tokenized_examples[i]['answerable_label'] = 0 else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). 
while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples[i][ "start_positions"] = token_start_index - 1 while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples[i]["end_positions"] = token_end_index + 1 tokenized_examples[i]['answerable_label'] = 1 return tokenized_examples if args.do_train: assert args.train_file != None, "--train_file should be set when training!" train_ds = DuReaderChecklist().read(args.train_file) train_ds.map(prepare_train_features, batched=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), "start_positions": Stack(dtype="int64"), "end_positions": Stack(dtype="int64"), "answerable_label": Stack(dtype="int64") }): fn(samples) train_data_loader = DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs if paddle.distributed.get_rank() == 0: # dev_count = paddle.fluid.core.get_cuda_device_count() # print("Device count: %d" % dev_count) print("Num train examples: %d" % len(train_ds.data)) print("Max train steps: %d" % num_training_steps) lr_scheduler = LinearDecayWithWarmup( args.learning_rate, num_training_steps, args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) criterion = CrossEntropyLossForChecklist() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, start_positions, end_positions, answerable_label = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = criterion(logits, (start_positions, end_positions,answerable_label)) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. 
contexts = [examples[i]['context'] for i in range(len(examples))] questions = [examples[i]['question'] for i in range(len(examples))] tokenized_examples = tokenizer( questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) # For validation, there is no need to compute start and end positions for i, tokenized_example in enumerate(tokenized_examples): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_example['token_type_ids'] # One example can give several spans, this is the index of the example containing this span of text. sample_index = tokenized_example['overflow_to_sample'] tokenized_examples[i]["example_id"] = examples[sample_index]['id'] # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples[i]["offset_mapping"] = [ (o if sequence_ids[k] == 1 else None) for k, o in enumerate(tokenized_example["offset_mapping"]) ] return tokenized_examples if args.do_pred: input_files = [] assert args.predict_file != None, "--predict_file should be set when predicting!" for input_pattern in args.predict_file: input_files.extend(glob.glob(input_pattern)) assert len(input_files) > 0, 'Can not find predict_file {}'.format(args.predict_file) for input_file in input_files: print('Run prediction on {}'.format(input_file)) prefix = os.path.basename(input_file) prefix = re.sub('.json', '', prefix) dev_ds = DuReaderChecklist().read(input_file) dev_ds.map(prepare_validation_features, batched=True) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.batch_size, shuffle=False) dev_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id) }): fn(samples) dev_data_loader = DataLoader( dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=dev_batchify_fn, return_list=True) if paddle.distributed.get_rank() == 0: evaluate(model, dev_data_loader, args, prefix=prefix)
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddle
import paddleaudio
import pytest
from paddleaudio.transforms import ISTFT, STFT, MelSpectrogram
from paddleaudio.utils._librosa import melspectrogram

paddle.set_device('cpu')
EPS = 1e-8

import itertools

from utils import load_example_audio1


# test case for stft
def generate_stft_test():
    n_fft = [512, 1024]
    hop_length = [160, 320]
    window = [
        'hann',
        'hamming',
        ('gaussian', 100),  #, ('tukey', 0.5),
        'blackman'
def run(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) set_seed(args) train_examples = load_dataset('PaddlePaddle/dureader_robust', split="train") dev_examples = load_dataset('PaddlePaddle/dureader_robust', split="validation") column_names = train_examples.column_names if rank == 0: if os.path.exists(args.model_name_or_path): print("init checkpoint from %s" % args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. contexts = examples['context'] questions = examples['question'] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples['token_type_ids'][i] # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples['answers'][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != 1: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != 1: token_end_index -= 1 token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) return tokenized_examples if args.do_train: train_ds = train_examples.map(prepare_train_features, batched=True, remove_columns=column_names, num_proc=4) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_batchify_fn = lambda samples, fn=Dict( { "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), "start_positions": Stack(dtype="int64"), "end_positions": Stack(dtype="int64") }): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = CrossEntropyLossForSQuAD() global_step = 0 tic_train = time.time() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, start_positions, end_positions = batch logits = model(input_ids=input_ids, token_type_ids=token_type_ids) loss = criterion(logits, (start_positions, end_positions)) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch + 1, step + 1, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if rank == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) if global_step == num_training_steps: break def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. 
This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. contexts = examples['context'] questions = examples['question'] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length, return_attention_mask=True) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples['token_type_ids'][i] context_index = 1 # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if args.do_predict and rank == 0: dev_ds = dev_examples.map(prepare_validation_features, batched=True, remove_columns=column_names, num_proc=4) dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) dev_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id) }): fn(samples) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=dev_batchify_fn, return_list=True) evaluate(model, dev_examples, dev_data_loader, args)
def do_train(args): assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() worker_index = paddle.distributed.get_rank() worker_num = paddle.distributed.get_world_size() set_seed(args) worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank()) model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) eod_id = tokenizer.command_name_map["eod"].Id pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) if args.model_name_or_path in pretrained_models_list: model = GPT2ForPretraining( GPT2Model(**model_class.pretrained_init_configuration[ args.model_name_or_path])) else: model = GPT2ForPretraining.from_pretrained(args.model_name_or_path) if args.decay_steps is None: args.decay_steps = args.max_steps warmup_step = args.warmup_rate * args.decay_steps lr_scheduler = lr.CosineAnnealingWithWarmupDecay( max_lr=args.max_lr, min_lr=args.min_lr, warmup_step=warmup_step, decay_step=args.decay_steps) clip = None if args.grad_clip > 0: clip = paddle.nn.ClipGradByNorm(clip_norm=args.grad_clip) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=clip, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) if args.model_name_or_path not in pretrained_models_list: opt_dict = paddle.load( os.path.join(args.model_name_or_path, "model_state.pdopt")) optimizer.set_state_dict(opt_dict) # creat the critrion for the gpt model criterion = GPT2PretrainingCriterion() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if (os.path.isfile(os.path.join(args.input_dir, f)) and "npz_" not in str(f)) ] files.sort() num_files = len(files) for f_id in range(num_files): data_file = files[f_id] train_data_loader = create_pretrained_dataset(args, data_file, worker_init, worker_index, worker_num, eod_id=eod_id) for step, batch in enumerate(train_data_loader): global_step += 1 tokens, loss_mask, attention_mask, position_ids, labels = batch loss_mask.stop_gradient = True attention_mask.stop_gradient = True preds = model(tokens, position_ids, attention_mask) loss = criterion(preds, labels, loss_mask) if global_step % args.logging_steps == 0: if worker_index == 0: logger.info( "global step %d, epoch: %d, lr: %.10f, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, optimizer.get_lr(), step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step >= args.max_steps: if worker_index == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model logger.info("Save model to %s" % output_dir) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) paddle.save( optimizer.state_dict(), os.path.join(output_dir, "model_state.pdopt")) if global_step >= args.max_steps: logger.info("The 
training process is complete.") del train_data_loader return del train_data_loader
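# The CosineAnnealingWithWarmupDecay schedule configured above follows the usual
# "linear warmup, then cosine decay down to min_lr" shape. A plain-Python sketch
# of that curve, as an illustration of the schedule rather than the library's
# exact implementation:
import math


def warmup_cosine_lr(step, max_lr, min_lr, warmup_step, decay_step):
    if step < warmup_step:
        return max_lr * step / warmup_step               # linear warmup
    if step > decay_step:
        return min_lr                                    # floor after decay
    progress = (step - warmup_step) / (decay_step - warmup_step)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))   # cosine from 1 to 0
    return min_lr + coeff * (max_lr - min_lr)


# e.g. warmup_cosine_lr(step=1000, max_lr=1e-4, min_lr=1e-5,
#                       warmup_step=320, decay_step=360000)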
parser.add_argument("--num_angle", type=int, default=4) parser.add_argument('--max_dist_2d', type=float, default=3.) parser.add_argument('--cut_dist', type=float, default=5.) parser.add_argument("--spa_w", type=float, default=0.1) parser.add_argument("--gcl_w", type=float, default=1) parser.add_argument("--tau", type=float, default=0.05) args = parser.parse_args() setup_seed(args.seed) if not args.num_dist: args.num_dist = 2 if args.cut_dist <= 4 else 4 if not os.path.isdir(args.model_dir): os.mkdir(args.model_dir) if int(args.cuda) == -1: paddle.set_device('cpu') else: paddle.set_device('gpu:%s' % args.cuda) if args.dataset in ['esol', 'lipop', 'freesolv']: args.task = 'regression' elif args.dataset in ['clintox', 'sider', 'tox21', 'toxcast']: args.task = ' classification' else: print('The dataset %s is not included.' % args.dataset) exit(-1) data_2d = Molecule2DView(args.data_dir, args.dataset) data_3d = Molecule3DView(args.data_dir, args.dataset, args.cut_dist, args.num_angle, args.num_dist) assert len(data_2d) == len(data_3d) node_in_dim = data_2d.atom_feat_dim
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank()) args.model_type = args.model_type.lower() base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[ args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) model = model_class( base_class(**model_class.pretrained_init_configuration[ args.model_name_or_path])) criterion = criterion_class( getattr(model, model_class.base_model_prefix).config["vocab_size"]) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) # If use defalut last_epoch, lr of the first iteration is 0. # Use `last_epoch = 0` to be consistent with nv bert. num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps, last_epoch=0) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) pool = ThreadPoolExecutor(1) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 shared_file_list = {} if paddle.distributed.get_world_size() > num_files: remainder = paddle.distributed.get_world_size() % num_files data_file = files[ (f_start_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank() + remainder * f_start_id) % num_files] else: data_file = files[ (f_start_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank()) % num_files] previous_file = data_file train_data_loader, _ = create_pretraining_dataset( data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init) # TODO(guosheng): better way to process single file single_file = True if f_start_id + 1 == len(files) else False for f_id in range(f_start_id, len(files)): if not single_file and f_id == f_start_id: continue if paddle.distributed.get_world_size() > num_files: data_file = files[(f_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank() + remainder * f_id) % num_files] else: data_file = files[(f_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank()) % num_files] previous_file = data_file dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init) for step, batch in enumerate(train_data_loader): global_step += 1 (input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels, masked_lm_scale) = batch with paddle.amp.auto_cast( args.use_amp, custom_white_list=["layer_norm", "softmax", "gelu"]): prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, 
masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) if args.use_amp: scaler.scale(loss).backward() scaler.minimize(optimizer, loss) else: loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: if paddle.distributed.get_rank() == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0: if paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) paddle.save( optimizer.state_dict(), os.path.join(output_dir, "model_state.pdopt")) if global_step >= args.max_steps: del train_data_loader return del train_data_loader train_data_loader, data_file = dataset_future.result(timeout=None)
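# A condensed, self-contained view of the mixed-precision pattern used in the
# loop above (toy Linear model instead of BERT, assumes a GPU device as in the
# surrounding script; shown only to make the auto_cast / GradScaler interplay
# explicit):
import paddle

toy_model = paddle.nn.Linear(8, 2)
toy_opt = paddle.optimizer.AdamW(learning_rate=1e-3,
                                 parameters=toy_model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=2.0**15)

x = paddle.randn([4, 8])
with paddle.amp.auto_cast(custom_white_list=["layer_norm", "softmax", "gelu"]):
    loss = paddle.mean(toy_model(x))
scaler.scale(loss).backward()    # backward runs on the scaled loss
scaler.minimize(toy_opt, loss)   # unscales the grads and applies the update
toy_opt.clear_grad()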
    shuffle = True if mode == 'train' else False
    if mode == "train":
        sampler = paddle.io.DistributedBatchSampler(
            dataset=dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        sampler = paddle.io.BatchSampler(
            dataset=dataset, batch_size=batch_size, shuffle=shuffle)
    dataloader = paddle.io.DataLoader(
        dataset, batch_sampler=sampler, collate_fn=batchify_fn)
    return dataloader


if __name__ == "__main__":
    paddle.set_device(args.device)
    set_seed()

    # Loads vocab.
    if not os.path.exists(args.vocab_path):
        raise RuntimeError('The vocab_path cannot be found in the path %s' %
                           args.vocab_path)
    vocab = Vocab.load_vocabulary(
        args.vocab_path, unk_token='[UNK]', pad_token='[PAD]')

    # Loads dataset.
    train_ds, dev_ds, test_ds = load_dataset(
        "chnsenticorp", splits=["train", "dev", "test"])

    # Constructs the network.
    model = ppnlp.models.Senta(
def test_fit_by_epoch(self): base_lr = 1e-3 boundaries = [5, 8] epochs = 10 wamup_epochs = 4 def make_optimizer(parameters=None): momentum = 0.9 weight_decay = 5e-4 values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)] learning_rate = paddle.optimizer.lr.PiecewiseDecay( boundaries=boundaries, values=values) learning_rate = paddle.optimizer.lr.LinearWarmup( learning_rate=learning_rate, warmup_steps=wamup_epochs, start_lr=base_lr / 5., end_lr=base_lr, verbose=True) optimizer = paddle.optimizer.Momentum(learning_rate=learning_rate, weight_decay=weight_decay, momentum=momentum, parameters=parameters) return optimizer # dynamic test device = paddle.set_device('cpu') fluid.enable_dygraph(device) net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = make_optimizer(net.parameters()) model = Model(net, inputs, labels) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) dataset = MyDataset() lr_scheduler_callback = paddle.callbacks.LRScheduler(by_step=False, by_epoch=True) model.fit(dataset, dataset, batch_size=4, epochs=epochs, num_workers=0, callbacks=lr_scheduler_callback) cnt = 0 for b in boundaries: if b + wamup_epochs <= epochs: cnt += 1 np.testing.assert_allclose(model._optimizer._learning_rate.last_lr, base_lr * (0.1**cnt)) # static test paddle.enable_static() net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = make_optimizer(net.parameters()) model = Model(net, inputs, labels) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) dataset = MyDataset() lr_scheduler_callback = paddle.callbacks.LRScheduler(by_step=False, by_epoch=True) model.fit(dataset, dataset, batch_size=4, epochs=epochs, num_workers=0, callbacks=lr_scheduler_callback) cnt = 0 for b in boundaries: if b + wamup_epochs <= epochs: cnt += 1 np.testing.assert_allclose(model._optimizer._learning_rate.last_lr, base_lr * (0.1**cnt))
def do_train(args): device = paddle.set_device(args.device) metric_class = METRIC_CLASSES[args.task_name] metric = metric_class() if args.task_name == 'qqp': train_data_loader, dev_data_loader = create_pair_loader_for_small_model( task_name=args.task_name, vocab_path=args.vocab_path, model_name=args.model_name, batch_size=args.batch_size) else: train_data_loader, dev_data_loader = create_data_loader_for_small_model( task_name=args.task_name, vocab_path=args.vocab_path, model_name=args.model_name if args.task_name == 'sst-2' else None, batch_size=args.batch_size) model = BiLSTM(args.emb_dim, args.hidden_size, args.vocab_size, args.output_dim, args.vocab_path, args.padding_idx, args.num_layers, args.dropout_prob, args.init_scale, args.embedding_name) loss_fct = nn.CrossEntropyLoss() if args.optimizer == 'adadelta': optimizer = paddle.optimizer.Adadelta(learning_rate=args.lr, rho=0.95, parameters=model.parameters()) else: optimizer = paddle.optimizer.Adam(learning_rate=args.lr, parameters=model.parameters()) if args.init_from_ckpt: model.set_state_dict(paddle.load(args.init_from_ckpt + ".pdparams")) optimizer.set_state_dict(paddle.load(args.init_from_ckpt + ".pdopt")) print("Loaded checkpoint from %s" % args.init_from_ckpt) global_step = 0 tic_train = time.time() for epoch in range(args.max_epoch): for i, batch in enumerate(train_data_loader): global_step += 1 if args.task_name == 'qqp': input_ids_1, seq_len_1, input_ids_2, seq_len_2, labels = batch logits = model(input_ids_1, seq_len_1, input_ids_2, seq_len_2) else: input_ids, seq_len, labels = batch logits = model(input_ids, seq_len) loss = loss_fct(logits, labels) loss.backward() optimizer.step() optimizer.clear_grad() if global_step % args.log_freq == 0: with paddle.no_grad(): print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.4f step/s" % (global_step, epoch, i, loss, args.log_freq / (time.time() - tic_train))) tic_eval = time.time() acc = evaluate(args.task_name, model, loss_fct, metric, dev_data_loader) print("eval done total : %s s" % (time.time() - tic_eval)) tic_train = time.time() if global_step % args.save_steps == 0: paddle.save( model.state_dict(), os.path.join(args.output_dir, "step_" + str(global_step) + ".pdparams")) paddle.save( optimizer.state_dict(), os.path.join(args.output_dir, "step_" + str(global_step) + ".pdopt"))
        # Shape: (batch_size, embedding_dim)
        summed = self.bow_encoder(embedded_text)
        summed = self.dropout(summed)
        encoded_text = paddle.tanh(summed)
        # Shape: (batch_size, hidden_size)
        fc1_out = paddle.tanh(self.fc1(encoded_text))
        # Shape: (batch_size, fc_hidden_size)
        fc2_out = paddle.tanh(self.fc2(fc1_out))
        # Shape: (batch_size, num_classes)
        logits = self.output_layer(fc2_out)
        return logits


if __name__ == '__main__':
    paddle.set_device('gpu') if args.use_gpu else paddle.set_device('cpu')

    # Loads vocab.
    if not os.path.exists(args.vocab_path):
        raise RuntimeError('The vocab_path cannot be found in the path %s' %
                           args.vocab_path)
    vocab = data.load_vocab(args.vocab_path)
    if '[PAD]' not in vocab:
        vocab['[PAD]'] = len(vocab)

    # Loads dataset.
    train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets(
        ['train', 'dev', 'test'])

    # Constructs the network.
    num_classes = len(train_ds.get_labels())
def do_train(): paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args.seed) train_ds = load_dataset(read_text_pair, data_path=args.train_set_file, lazy=False) # If you wanna use bert/roberta pretrained model, # pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese') # pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext') pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained( 'ernie-1.0') # If you wanna use bert/roberta pretrained model, # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0') trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # query_segment Pad(axis=0, pad_val=tokenizer.pad_token_id), # title_input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # tilte_segment ): [data for data in fn(samples)] train_data_loader = create_dataloader(train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) model = SemanticIndexBatchNeg(pretrained_model, margin=args.margin, scale=args.scale, output_emb_size=args.output_emb_size) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) print("warmup from:{}".format(args.init_from_ckpt)) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch loss = model(query_input_ids=query_input_ids, title_input_ids=title_input_ids, query_token_type_ids=query_token_type_ids, title_token_type_ids=title_token_type_ids) global_step += 1 if global_step % 10 == 0 and rank == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, 10 / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 and rank == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) save_param_path = os.path.join(save_dir, 'model_state.pdparams') paddle.save(model.state_dict(), save_param_path) tokenizer.save_pretrained(save_dir)
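# SemanticIndexBatchNeg (used above) trains with in-batch negatives: every other
# title in the batch acts as a negative for a given query. A simplified sketch of
# that loss, as an illustration of the idea rather than the exact class
# implementation (margin/scale mirror the constructor arguments above):
import paddle
import paddle.nn.functional as F


def in_batch_negative_loss(query_emb, title_emb, scale=20.0, margin=0.3):
    # query_emb, title_emb: [batch_size, emb_dim], assumed L2-normalized.
    sims = paddle.matmul(query_emb, title_emb, transpose_y=True)  # [B, B]
    # Subtract the margin from the positive (diagonal) similarities only.
    sims = sims - margin * paddle.eye(sims.shape[0], dtype=sims.dtype)
    sims = sims * scale
    labels = paddle.arange(0, sims.shape[0], dtype='int64')  # positives sit on the diagonal
    return F.cross_entropy(sims, labels)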