def load(filename, **kwargs):
    # NSML load callback: restore model (and, if present, optimizer) state plus pickled auxiliary objects.
    state = torch.load(os.path.join(filename, 'model.pt'))
    model.load_state_dict(state['model'])
    if 'optimizer' in state and optimizer:
        optimizer.load_state_dict(state['optimizer'])
    with open(os.path.join(filename, 'class.pkl'), 'rb') as fp:
        temp_class = pickle.load(fp)
    nsml.copy(temp_class, class_to_save)
    print('Model loaded')
def load(dir_name, *args, **kwargs):
    # NSML load callback: restore model weights, the saved training args, and the tokenizer.
    state = torch.load(os.path.join(dir_name, 'model.pt'))
    model.load_state_dict(state)
    temp_my_args = torch.load(os.path.join(dir_name, "my_args.bin"))
    nsml.copy(temp_my_args, my_args)
    temp_tokenizer = torch.load(os.path.join(dir_name, 'tokenizer'))
    nsml.copy(temp_tokenizer, tokenizer)
    logger.info("Load model & tokenizer & args from {}".format(dir_name))
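# Note: the load() callbacks above are not called directly; they are registered with NSML inside
# bind_nsml() (defined elsewhere in this repo) together with a matching save() callback, and NSML
# invokes them when a checkpoint is restored via nsml.load(). A minimal sketch of that registration,
# assuming the standard nsml.bind(save=..., load=...) pattern (the actual bind_nsml here may differ):
#
#     def bind_nsml(model, tokenizer, args):
#         def save(dir_name, *parser, **kwargs):
#             os.makedirs(dir_name, exist_ok=True)
#             torch.save(model.state_dict(), os.path.join(dir_name, 'model.pt'))
#
#         def load(dir_name, *parser, **kwargs):
#             model.load_state_dict(torch.load(os.path.join(dir_name, 'model.pt')))
#
#         nsml.bind(save=save, load=load)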
def main():
    parser = argparse.ArgumentParser()

    # Required parameters (plus the additional arguments we defined for our experiments)
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--load_cache",
        action="store_true",
        help="Load data from a cached session",
    )
    parser.add_argument(
        "--save_cache",
        action="store_true",
        help="Save the loaded dataset into a cache",
    )
    parser.add_argument(
        "--cached_session_pretrain",
        default="",
        type=str,
        help="Path to the cache where the 'Span-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_pretrain_qa",
        default="",
        type=str,
        help="Path to the cache where the 'QA-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_train",
        default="",
        type=str,
        help="Path to the cache where the given 'training' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_dev",
        default="",
        type=str,
        help="Path to the cache where the given 'development' set is stored",
    )
    parser.add_argument(
        "--load_model",
        action="store_true",
        help="Use a pretrained model from a previous session",
    )
    parser.add_argument(
        "--load_model_session",
        default="",
        type=str,
        help="Session of the pre-trained model to load",
    )
    parser.add_argument(
        "--load_model_checkpoint",
        default="",
        type=str,
        help="Checkpoint of the pre-trained model to load",
    )
    parser.add_argument(
        "--just_for_save",
        action="store_true",
        help="Save a checkpoint and terminate immediately",
    )
    parser.add_argument(
        "--freeze_embedding",
        action="store_true",
        help="Freeze the encoder and fine-tune only the classification layer",
    )
    parser.add_argument(
        "--mix_qa",
        action="store_true",
        help="Mix the QA set for variance",
    )
    parser.add_argument(
        "--mix_portion",
        type=float,
        default=0.5,
        help="Portion of QA pairs to be reconstructed",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        + " If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there."
        + " If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there."
        + " If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_pretrain_span", action="store_true", help="Whether to run span-pretraining.")
    parser.add_argument("--do_pretrain_qa", action="store_true", help="Whether to run QA-pretraining.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        default=True,
        action="store_true",
        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument("--do_initial_validation", action="store_true", help="Whether to run initial validation.")
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )
    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X update steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X update steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on GPUs")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1, help="Multiple threads for converting examples to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log',
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
    # tokenizer.add_special_tokens({"additional_special_tokens": ["[QUES]"]})
    # print("vocabsize: {}".format(tokenizer.vocab_size))
    # print("example")
    # print(tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]"))
    model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 0:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)
    # bind_nsml(model, tokenizer, args)

    if args.load_model:
        # Keep the current args while nsml.load() restores the checkpointed ones, then copy them back.
        tmp_args = parser.parse_args()
        nsml.copy(args, tmp_args)
        nsml.load(checkpoint=args.load_model_checkpoint, session=args.load_model_session)
        nsml.copy(tmp_args, args)

    if args.just_for_save:
        nsml.save("test")
        return

    # Initial validation
    if args.do_initial_validation:
        logger.info("Initial validation start")
        result = evaluate(args, model, tokenizer, prefix="")
        _f1, _exact = result["f1"], result["exact"]
        logger.info("f1_val = {}, exact_val = {}".format(_f1, _exact))
        if IS_ON_NSML:
            nsml.report(summary=True, step=0, f1=_f1, exact=_exact)

    # 'Span' pretraining
    if args.do_pretrain_span:
        t = time.time()
        train_dataset = load_and_cache_examples(
            model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=False
        )
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span")

    # 'QA' pretraining
    if args.do_pretrain_qa:
        t = time.time()
        train_dataset = load_and_cache_examples(
            model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=True
        )
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span+qa")

    # Training
    if args.do_train:
        if args.freeze_embedding:
            for param in model.module.electra.parameters():
                param.requires_grad = False
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False)
        t = time.time() - t
        logger.info("loading train data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
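# Example invocation (a sketch only: the script name, dataset directory, and hyperparameter values
# below are placeholders, and on NSML the script is launched through the platform's own run command
# rather than plain python):
#
#     python main.py \
#         --model_type electra \
#         --model_name_or_path monologg/koelectra-base-v3-finetuned-korquad \
#         --data_dir train \
#         --output_dir output \
#         --do_train --do_eval \
#         --per_gpu_train_batch_size 8 --learning_rate 5e-5 --num_train_epochs 3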
def load_data(dir_name):
    # NSML load callback for cached datasets: copy the cached feature/dataset dict
    # from a previous session into the in-memory features_and_datasets container.
    tmp = torch.load(os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
    print(tmp.keys())
    nsml.copy(tmp, features_and_datasets)
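# load_data() is presumably swapped in as the NSML load callback when --load_cache is set, so the
# feature/dataset cache saved by a previous session (--cached_session_pretrain, --cached_session_pretrain_qa,
# --cached_session_train, --cached_session_dev) can be pulled in with nsml.load() instead of re-running the
# expensive example-to-feature conversion. This is an assumption based on the arguments defined in main();
# the exact wiring lives elsewhere in the repo.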