def main():
    """Command-line entry point: fine-tune, evaluate and predict with a tagging model.

    Parses the command line, sets up the device / distributed backend and
    logging, loads config, tokenizer and model, and then runs whichever stages
    were requested via ``--do_train``, ``--do_eval``, ``--do_predict``,
    ``--do_predict_train`` and ``--do_predict_dev``.

    Raises:
        ValueError: if ``--do_train`` is set and ``--output_dir`` already
            exists and is non-empty while ``--overwrite_output_dir`` is not
            given.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the training files for the NER/POS task.")
    parser.add_argument(
        "--model_type", default=None, type=str, required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path", default=None, type=str, required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument(
        "--labels", default="", type=str,
        help="Path to a file containing all labels. If not specified, NER/POS labels are used.")
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name", default="", type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir", default=None, type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument("--do_predict_dev", action="store_true",
                        help="Whether to run predictions on the dev set.")
    parser.add_argument("--do_predict_train", action="store_true")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="initial checkpoint for train/predict")
    parser.add_argument(
        "--evaluate_during_training", action="store_true",
        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--few_shot", default=-1, type=int,
                        help="num of few-shot exampes")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps", type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps", default=-1, type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--save_only_best_checkpoint", action="store_true",
                        help="Save only the best checkpoint during training")
    parser.add_argument(
        "--eval_all_checkpoints", action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16", action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level", type=str, default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="For distant debugging.")
    parser.add_argument("--predict_langs", type=str, default="en",
                        help="prediction languages")
    parser.add_argument("--train_langs", default="en", type=str,
                        help="The languages in the training sets.")
    parser.add_argument("--log_file", type=str, default=None, help="log file")
    parser.add_argument(
        "--eval_patience", type=int, default=-1,
        help="wait N times of decreasing dev score before early stop during training")

    ## SDE parameters
    parser.add_argument("--max_ngram_size", default=10, type=int,
                        help="ngram size for each word")
    parser.add_argument("--bpe_segment", type=int, default=1,
                        help="whether to segment by BPE or by word")
    parser.add_argument("--sde_latent", type=int, default=5000,
                        help="sde latent emb size")
    parser.add_argument("--use_sde_embed", action="store_true")
    parser.add_argument("--add_sde_embed", action="store_true")
    # NOTE(review): the help text of the next seven options looks copy-pasted
    # from --eval_patience and does not describe them; the strings are left
    # untouched here and should be rewritten by someone who knows the intent.
    parser.add_argument(
        "--tau", type=float, default=-1,
        help="wait N times of decreasing dev score before early stop during training")
    parser.add_argument(
        "--attention_t", type=float, default=1,
        help="wait N times of decreasing dev score before early stop during training")
    parser.add_argument(
        "--mlm_weight", type=float, default=-1,
        help="wait N times of decreasing dev score before early stop during training")
    parser.add_argument(
        "--mlm_lang", type=str, default='ur',
        help="wait N times of decreasing dev score before early stop during training")
    parser.add_argument(
        "--mlm_start_epoch", type=int, default=0,
        help="wait N times of decreasing dev score before early stop during training")
    parser.add_argument(
        "--mlm_end_epoch", type=int, default=0,
        help="wait N times of decreasing dev score before early stop during training")
    parser.add_argument(
        "--update_pretrained_epoch", type=int, default=0,
        help="wait N times of decreasing dev score before early stop during training")
    parser.add_argument("--bpe_dropout", default=0, type=float)
    parser.add_argument("--resample_dataset", default=0, type=float,
                        help="set to 1 if resample at each epoch")
    parser.add_argument("--fix_class", action='store_true')

    # RecAdam parameters
    parser.add_argument("--optimizer", type=str, default="RecAdam",
                        choices=["Adam", "RecAdam"],
                        help="Choose the optimizer to use. Default RecAdam.")
    parser.add_argument(
        "--recadam_anneal_fun", type=str, default='sigmoid',
        choices=["sigmoid", "linear", 'constant'],
        help="the type of annealing function in RecAdam. Default sigmoid")
    parser.add_argument("--recadam_anneal_k", type=float, default=0.5,
                        help="k for the annealing function in RecAdam.")
    parser.add_argument("--recadam_anneal_t0", type=int, default=250,
                        help="t0 for the annealing function in RecAdam.")
    parser.add_argument(
        "--recadam_anneal_w", type=float, default=1.0,
        help="Weight for the annealing function in RecAdam. Default 1.0.")
    parser.add_argument(
        "--recadam_pretrain_cof", type=float, default=5000.0,
        help="Coefficient of the quadratic penalty in RecAdam. Default 5000.0.")
    parser.add_argument(
        "--logging_Euclid_dist", action="store_true",
        help="Whether to log the Euclidean distance between the pretrained model and fine-tuning model")
    parser.add_argument(
        "--start_from_pretrain", action="store_true",
        help="Whether to initialize the model with pretrained parameters")
    parser.add_argument("--albert_dropout", default=0.0, type=float,
                        help="The dropout rate for the ALBERT model")
    parser.add_argument("--few_shot_extra_langs", type=str, default=None)
    parser.add_argument("--few_shot_extra_langs_size", type=str, default=None)

    args = parser.parse_args()

    # Refuse to clobber the results of an earlier training run unless asked to.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which synchronizes nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging. --log_file defaults to None and logging.FileHandler(None)
    # raises TypeError, so the file handler is attached only when a path is given.
    log_handlers = [logging.StreamHandler()]
    if args.log_file:
        log_handlers.insert(0, logging.FileHandler(args.log_file))
    logging.basicConfig(
        handlers=log_handlers,
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logging.info("Input args: %r", args)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare NER/POS task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id
    # so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    # Make sure only the first process in distributed training loads model/vocab
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    # A positive MLM weight selects the "<model_type>_mlm" entry of MODEL_CLASSES.
    class_key = args.model_type + "_mlm" if args.mlm_weight > 0 else args.model_type
    config_class, model_class, tokenizer_class = MODEL_CLASSES[class_key]

    cache_dir = args.cache_dir if args.cache_dir else None
    from_tf = ".ckpt" in args.model_name_or_path

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        use_sde_embed=args.use_sde_embed,
        add_sde_embed=args.add_sde_embed,
        sde_latent=args.sde_latent,
        mlm_weight=args.mlm_weight,
        attention_t=args.attention_t,
        fix_class=args.fix_class,
        cache_dir=cache_dir)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=cache_dir)

    # With RecAdam a second copy of the pretrained weights is loaded and handed
    # to train(); with plain Adam no such copy is made.
    if args.optimizer == 'RecAdam':
        pretrained_model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=from_tf,
            config=config,
            cache_dir=cache_dir,
        )
        pretrained_model.to(args.device)
    else:
        pretrained_model = None

    if args.init_checkpoint:
        logger.info("loading from init_checkpoint=%s", args.init_checkpoint)
        model = model_class.from_pretrained(args.init_checkpoint,
                                            config=config,
                                            cache_dir=args.init_checkpoint)
    else:
        logger.info("loading from cached model = %s", args.model_name_or_path)
        model = model_class.from_pretrained(args.model_name_or_path,
                                            from_tf=from_tf,
                                            config=config,
                                            cache_dir=cache_dir)
    lang2id = config.lang2id if args.model_type == "xlm" else None
    logger.info("Using lang2id = %s", lang2id)

    # Make sure only the first process in distributed training loads model/vocab
    if args.local_rank == 0:
        torch.distributed.barrier()
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(
            args, tokenizer, labels, pad_token_label_id, mode="train",
            lang=args.train_langs, lang2id=lang2id, few_shot=args.few_shot)
        global_step, tr_loss = train(
            args, train_dataset, model, tokenizer, labels, pad_token_label_id,
            lang2id, pretrained_model=pretrained_model)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model,
    # you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        # Save model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training: unwrap `.module` if present
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Initialization for evaluation: pick the checkpoint the prediction stages
    # will load unless the evaluation loop below finds a better one.
    results = {}
    if args.init_checkpoint:
        best_checkpoint = args.init_checkpoint
    elif os.path.exists(os.path.join(args.output_dir, 'checkpoint-best')):
        best_checkpoint = os.path.join(args.output_dir, 'checkpoint-best')
    else:
        best_checkpoint = args.output_dir
    best_f1 = 0

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = [
                os.path.dirname(c)
                for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            ]
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # With several checkpoints, the text after the last "-" in the
            # path is used to prefix that checkpoint's metric names.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id,
                                 mode="dev", prefix=global_step,
                                 lang=args.train_langs, lang2id=lang2id)
            if result["f1"] > best_f1:
                best_checkpoint = checkpoint
                best_f1 = result["f1"]
            if global_step:
                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))
            writer.write("best checkpoint = {}, best f1 = {}\n".format(best_checkpoint, best_f1))

    def _run_prediction(split, results_name, file_mode, **save_kwargs):
        """Reload the best checkpoint, evaluate `split` per language and save the outputs."""
        logger.info("Loading the best checkpoint from %s\n", best_checkpoint)
        split_tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        split_model = model_class.from_pretrained(best_checkpoint)
        split_model.to(args.device)

        results_path = os.path.join(args.output_dir, results_name)
        with open(results_path, file_mode) as result_writer:
            for lang in args.predict_langs.split(','):
                infile = os.path.join(
                    args.data_dir, lang, "{}.{}".format(split, args.model_name_or_path))
                if not os.path.exists(infile):
                    logger.info("Language %s does not exist", lang)
                    continue
                result, predictions = evaluate(
                    args, split_model, split_tokenizer, labels, pad_token_label_id,
                    mode=split, lang=lang, lang2id=lang2id)

                # Save results
                result_writer.write("=====================\nlanguage={}\n".format(lang))
                for key in sorted(result.keys()):
                    result_writer.write("{} = {}\n".format(key, str(result[key])))

                # Save predictions
                predictions_path = os.path.join(
                    args.output_dir, "{}_{}_predictions.txt".format(split, lang))
                idxfile = infile + '.idx'
                save_predictions(args, predictions, predictions_path, infile,
                                 idxfile, **save_kwargs)

    # Prediction on the test set
    if args.do_predict and args.local_rank in [-1, 0]:
        _run_prediction("test", "test_results.txt", "a")

    # Prediction on the training set.
    # NOTE(review): metrics are appended to test_results.txt, the same file as
    # the test-set metrics; kept as-is, confirm whether that is intended.
    if args.do_predict_train and args.local_rank in [-1, 0]:
        _run_prediction("train", "test_results.txt", "a", output_word_prediction=True)

    # Predict dev set
    if args.do_predict_dev and args.local_rank in [-1, 0]:
        _run_prediction("dev", "dev_results.txt", "w")
def main():
    """Command-line entry point: train, evaluate and predict with an XLM token classifier.

    Parses the command line, sets up the device / distributed backend and
    logging, builds an ``XLMForTokenClassification`` from a config and weight
    file at fixed paths, and then runs whichever stages were requested via
    ``--do_train``, ``--do_eval``, ``--do_predict`` and ``--do_predict_dev``.

    Raises:
        ValueError: if ``--do_train`` is set and ``--output_dir`` already
            exists and is non-empty while ``--overwrite_output_dir`` is not
            given.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the training files for the NER/POS task.")
    parser.add_argument(
        "--model_type", default=None, type=str, required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path", default=None, type=str, required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument(
        "--labels", default="", type=str,
        help="Path to a file containing all labels. If not specified, NER/POS labels are used.")
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name", default="", type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir", default=None, type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument("--do_predict_dev", action="store_true",
                        help="Whether to run predictions on the dev set.")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="initial checkpoint for train/predict")
    parser.add_argument(
        "--evaluate_during_training", action="store_true",
        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--few_shot", default=-1, type=int,
                        help="num of few-shot exampes")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps", type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps", default=-1, type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--save_only_best_checkpoint", action="store_true",
                        help="Save only the best checkpoint during training")
    parser.add_argument(
        "--eval_all_checkpoints", action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16", action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level", type=str, default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="For distant debugging.")
    parser.add_argument("--predict_langs", type=str, default="en",
                        help="prediction languages")
    parser.add_argument("--train_langs", default="en", type=str,
                        help="The languages in the training sets.")
    parser.add_argument("--log_file", type=str, default=None, help="log file")
    parser.add_argument(
        "--eval_patience", type=int, default=-1,
        help="wait N times of decreasing dev score before early stop during training")

    args = parser.parse_args()

    # Refuse to clobber the results of an earlier training run unless asked to.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which synchronizes nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging. --log_file defaults to None and logging.FileHandler(None)
    # raises TypeError, so the file handler is attached only when a path is given.
    log_handlers = [logging.StreamHandler()]
    if args.log_file:
        log_handlers.insert(0, logging.FileHandler(args.log_file))
    logging.basicConfig(
        handlers=log_handlers,
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logging.info("Input args: %r", args)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare NER/POS task
    labels = get_labels(args.labels)
    # Use cross entropy ignore index as padding label id
    # so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    # Make sure only the first process in distributed training loads model/vocab
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, _, tokenizer_class = MODEL_CLASSES[args.model_type]
    cache_dir = args.cache_dir if args.cache_dir else None

    # NOTE(review): the config and weight locations below are hard-coded, so
    # --config_name, --model_name_or_path and --init_checkpoint do not affect
    # which model is built. No num_labels is passed either, so the label count
    # is whatever the stored config specifies.
    config_path = "/blob/kaiyuan-result/token-level/thisistheother"
    config = config_class.from_pretrained(
        config_path,
        cache_dir=cache_dir,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=cache_dir)

    model = XLMForTokenClassification(config)
    # NOTE(review): the return value is discarded; this presumably relies on a
    # project-specific from_pretrained that loads weights in place - confirm.
    model.from_pretrained(
        model_path="/blob/kaiyuan-result/token-level/kaiyuan-1230-bfpos-80.pth",
        config=config,
    )
    lang2id = config.lang2id if args.model_type == "xlm" else None
    logger.info("Using lang2id = %s", lang2id)

    # Make sure only the first process in distributed training loads model/vocab
    if args.local_rank == 0:
        torch.distributed.barrier()
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Default evaluation prefix; without it, --do_eval without --do_train
    # would hit an undefined `global_step` below.
    global_step = ""

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(
            args, tokenizer, labels, pad_token_label_id, mode="train",
            lang=args.train_langs, lang2id=lang2id, few_shot=args.few_shot)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id, lang2id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # After training only the tokenizer is written to the output directory;
    # model weights and training arguments are not saved by this function.
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)
        tokenizer.save_pretrained(args.output_dir)

    # Initialization for evaluation
    results = {}
    if args.init_checkpoint:
        best_checkpoint = args.init_checkpoint
    elif os.path.exists(os.path.join(args.output_dir, 'checkpoint-best.pth')):
        best_checkpoint = os.path.join(args.output_dir, 'checkpoint-best.pth')
    else:
        best_checkpoint = args.output_dir
    best_f1 = 0

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        # NOTE(review): in the fallback case best_checkpoint is the output
        # directory itself, which torch.load cannot read - confirm the
        # intended behaviour when no checkpoint file exists.
        model.load_state_dict(torch.load(best_checkpoint))
        model.to(args.device)
        result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id,
                             mode="dev", prefix=global_step,
                             lang=args.train_langs, lang2id=lang2id)
        # Report the score that was actually measured instead of the initial 0.
        best_f1 = result.get("f1", best_f1)
        results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))
            writer.write("best checkpoint = {}, best f1 = {}\n".format(
                best_checkpoint, best_f1))

    def _run_prediction(split, results_name, file_mode):
        """Evaluate `split` per language with the in-memory model and save the outputs."""
        results_path = os.path.join(args.output_dir, results_name)
        with open(results_path, file_mode) as result_writer:
            for lang in args.predict_langs.split(','):
                infile = os.path.join(
                    args.data_dir, lang, "{}.{}".format(split, args.model_name_or_path))
                if not os.path.exists(infile):
                    logger.info("Language %s does not exist", lang)
                    continue
                result, predictions = evaluate(
                    args, model, tokenizer, labels, pad_token_label_id,
                    mode=split, lang=lang, lang2id=lang2id)

                # Save results
                result_writer.write("=====================\nlanguage={}\n".format(lang))
                for key in sorted(result.keys()):
                    result_writer.write("{} = {}\n".format(key, str(result[key])))

                # Save predictions
                predictions_path = os.path.join(
                    args.output_dir, "{}_{}_predictions.txt".format(split, lang))
                idxfile = infile + '.idx'
                save_predictions(args, predictions, predictions_path, infile, idxfile)

    # Prediction on the test set. The model already in memory is used; nothing
    # is reloaded here even though the message below mentions a checkpoint.
    if args.do_predict and args.local_rank in [-1, 0]:
        logger.info("Loading the best checkpoint from %s\n", best_checkpoint)
        _run_prediction("test", "test_results.txt", "a")

    # Predict dev set
    if args.do_predict_dev and args.local_rank in [-1, 0]:
        _run_prediction("dev", "dev_results.txt", "w")
def _build_arg_parser():
    """Build the command-line parser for the train / eval / predict driver."""
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the training files for the NER/POS task.")
    parser.add_argument("--dataset_name",
                        required=True,
                        type=str,
                        help="Name of the dataset")
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to pre-trained model or shortcut name.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help="Path to a file containing all labels. If not specified, NER/POS labels are used.")
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument("--do_predict_dev",
                        action="store_true",
                        help="Whether to run predictions on the dev set.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        help="initial checkpoint for train/predict")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--few_shot",
                        default=-1,
                        type=int,
                        help="num of few-shot exampes")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--save_only_best_checkpoint",
                        action="store_true",
                        help="Save only the best checkpoint during training")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--workers",
                        type=int,
                        default=20,
                        help="number of workers for data loading.")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    # The two literals are concatenated; the trailing space keeps the
    # sentences from running together in the rendered help text.
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--log_file", type=str, default=None, help="log file")
    parser.add_argument(
        "--eval_patience",
        type=int,
        default=-1,
        help="wait N times of decreasing dev score before early stop during training")
    return parser


def _configure_logging(args):
    """Set up root logging to the console and, if given, to ``args.log_file``."""
    handlers = []
    # --log_file defaults to None and logging.FileHandler(None) raises
    # TypeError, so the file handler is attached only when a path was given.
    if args.log_file:
        handlers.append(logging.FileHandler(args.log_file))
    handlers.append(logging.StreamHandler())
    logging.basicConfig(
        handlers=handlers,
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        # Only the main process (rank -1 or 0) logs at INFO level.
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)


def _extract_keyphrases(tokens, tags):
    """Join B/I-tagged token runs into phrases and drop repeated phrases.

    A 'B' tag closes any open phrase and starts a new one, an 'I' tag extends
    the open phrase (or starts one when none is open, since the model may
    predict 'I' without a preceding 'B'), and any other tag closes the open
    phrase. Tokens beyond ``len(tags)`` are ignored.

    Returns the distinct phrases in order of first appearance.
    """
    phrases = []
    current = []
    for token, tag in zip(tokens, tags):
        if tag == 'B':
            if current:
                # save the previous keyphrase
                phrases.append(' '.join(current))
            current = [token]
        elif tag == 'I':
            current.append(token)
        elif current:
            phrases.append(' '.join(current))
            current = []
    if current:
        phrases.append(' '.join(current))
    # dict preserves insertion order, giving an order-stable de-duplication
    # without the quadratic "not in list" scan.
    return list(dict.fromkeys(phrases))


def main():
    """Parse CLI arguments, then train, evaluate and/or predict as requested.

    Steps are selected by --do_train, --do_eval, --do_predict and
    --do_predict_dev. Result and prediction files are written under
    ``args.output_dir``.

    Raises:
        ValueError: if --do_train is set, the output directory already exists
            and is non-empty, and --overwrite_output_dir was not passed.
    """
    args = _build_arg_parser().parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        # With --no_cuda the device is the CPU, so report zero GPUs instead of
        # the number physically present (0 is already what CPU-only hosts get).
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which synchronizes nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    _configure_logging(args)
    logging.info("Input args: %r", args)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare NER/POS task
    labels = get_labels()
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id
    # so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    # Make sure only the first process in distributed training loads model/vocab
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.init_checkpoint:
        logger.info("loading from init_checkpoint={}".format(
            args.init_checkpoint))
        # NOTE(review): cache_dir is the checkpoint path itself here, unlike
        # the other loads which use args.cache_dir -- confirm this is intended.
        model = model_class.from_pretrained(args.init_checkpoint,
                                            config=config,
                                            cache_dir=args.init_checkpoint)
    else:
        logger.info("loading from cached model = {}".format(
            args.model_name_or_path))
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None)

    # Make sure only the first process in distributed training loads model/vocab
    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train",
                                                few_shot=args.few_shot)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use default names for the model,
    # you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed (exist_ok avoids the
        # check-then-create race of a separate os.path.exists test)
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        # Save model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training: save the wrapped module
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Initialization for evaluation
    results = {}
    if args.init_checkpoint:
        best_checkpoint = args.init_checkpoint
    elif os.path.exists(os.path.join(args.output_dir, 'checkpoint-best')):
        best_checkpoint = os.path.join(args.output_dir, 'checkpoint-best')
    else:
        best_checkpoint = args.output_dir
    best_f1 = 0

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = [
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True))
            ]
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Text after the last "-" of the checkpoint path is used as the
            # result prefix when more than one checkpoint is evaluated.
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="valid",
                                 prefix=global_step)
            if result["f1"] > best_f1:
                best_checkpoint = checkpoint
                best_f1 = result["f1"]
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results):
                writer.write("{} = {}\n".format(key, str(results[key])))
            writer.write("best checkpoint = {}, best f1 = {}\n".format(
                best_checkpoint, best_f1))

    # Prediction
    if args.do_predict and args.local_rank in [-1, 0]:
        logger.info(
            "Loading the best checkpoint from {}\n".format(best_checkpoint))
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(best_checkpoint)
        model.to(args.device)

        if args.dataset_name:
            output_test_results_file = os.path.join(
                args.output_dir, "{}_results.txt".format(args.dataset_name))
            output_test_predictions_file = os.path.join(
                args.output_dir,
                "{}_predictions.txt".format(args.dataset_name))
        else:
            output_test_results_file = os.path.join(args.output_dir,
                                                    "test_results.txt")
            output_test_predictions_file = os.path.join(
                args.output_dir, "test_predictions.txt")

        with open(output_test_results_file, "w") as result_writer:
            result, predictions = evaluate(args,
                                           model,
                                           tokenizer,
                                           labels,
                                           pad_token_label_id,
                                           mode="test",
                                           print_result=False)
            # Save results
            logger.info("***** Test results *****")
            for key, value in result.items():
                logger.info(" %s = %s", key, value)
                result_writer.write("%s = %s\n" % (key, value))

        # Save predictions: one line of ';'-joined keyphrases per test example.
        # Both files are opened as UTF-8 explicitly so the JSON-lines input and
        # the emitted phrases do not depend on the platform default encoding.
        test_file = os.path.join(args.data_dir, "test.txt")
        with open(output_test_predictions_file, "w",
                  encoding="utf-8") as writer, \
                open(test_file, "r", encoding="utf-8") as f:
            for example_id, line in enumerate(f):
                ex = json.loads(line.strip())
                # note. we may truncate source
                assert len(ex['source']) >= len(predictions[example_id])
                kps = _extract_keyphrases(ex['source'],
                                          predictions[example_id])
                writer.write(';'.join(kps) + '\n')

    # Predict dev set
    if args.do_predict_dev and args.local_rank in [-1, 0]:
        logger.info(
            "Loading the best checkpoint from {}\n".format(best_checkpoint))
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(best_checkpoint)
        model.to(args.device)

        # NOTE(review): this is the same path the --do_eval step writes its
        # summary to, so running both overwrites that summary.
        output_test_results_file = os.path.join(args.output_dir,
                                                "eval_results.txt")
        with open(output_test_results_file, "w") as result_writer:
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="valid")
            # Save results
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info(" %s = %s", key, value)
                result_writer.write("%s = %s\n" % (key, value))