Example no. 1
def load(filename, **kwargs):
    # model, optimizer, and class_to_save are free variables captured from the
    # enclosing bind scope; NSML calls this hook with the checkpoint directory.
    state = torch.load(os.path.join(filename, 'model.pt'))
    model.load_state_dict(state['model'])
    if 'optimizer' in state and optimizer:
        optimizer.load_state_dict(state['optimizer'])
    with open(os.path.join(filename, 'class.pkl'), 'rb') as fp:
        temp_class = pickle.load(fp)
    # nsml.copy restores the unpickled object into the bound instance in place
    nsml.copy(temp_class, class_to_save)
    print('Model loaded')
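
For reference, a minimal sketch of the save/bind counterpart this load hook assumes. The save layout simply mirrors the load above; `nsml.bind(save=..., load=...)` is NSML's standard registration call, while the helper name `bind_nsml` and its signature are assumptions, not code from the original.

import os
import pickle

import torch
import nsml


def bind_nsml(model, optimizer, class_to_save):
    def save(filename, **kwargs):
        # Mirror of load(): persist weights plus the pickled helper object.
        state = {'model': model.state_dict()}
        if optimizer:
            state['optimizer'] = optimizer.state_dict()
        torch.save(state, os.path.join(filename, 'model.pt'))
        with open(os.path.join(filename, 'class.pkl'), 'wb') as fp:
            pickle.dump(class_to_save, fp)
        print('Model saved')

    def load(filename, **kwargs):
        ...  # the load hook shown above

    # Register both hooks so nsml.save()/nsml.load() route through them.
    nsml.bind(save=save, load=load)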
Example no. 2
    def load(dir_name, *args, **kwargs):
        state = torch.load(os.path.join(dir_name, 'model.pt'))
        model.load_state_dict(state)

        temp_my_args = torch.load(os.path.join(dir_name, "my_args.bin"))
        nsml.copy(temp_my_args, my_args)

        temp_tokenizer = torch.load(os.path.join(dir_name, 'tokenizer'))
        nsml.copy(temp_tokenizer, tokenizer)

        logger.info("Load model & tokenizer & args from {}".format(dir_name))
def main():

    parser = argparse.ArgumentParser()
    # Required parameters; additional arguments defined for the experiment
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--load_cache",
        action="store_true",
        help="load data from cached session",
    )
    parser.add_argument(
        "--save_cache",
        action="store_true",
        help="save loaded dataset into cache"
    )
    parser.add_argument(
        "--cached_session_pretrain",
        default="",
        type=str,
        help="Path to cache where 'Span-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_pretrain_qa",
        default="",
        type=str,
        help="Path to cache where 'QA-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_train",
        default="",
        type=str,
        help="Path to cache where given 'training' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_dev",
        default="",
        type=str,
        help="Path to cache where given 'development set' is stored",
    )
    parser.add_argument(
        "--load_model",
        action="store_true",
        help="use a pre-trained model from a previous session",
    )
    parser.add_argument(
        "--load_model_session",
        default="",
        type=str,
        help="NSML session that stores the pre-trained model",
    )
    parser.add_argument(
        "--load_model_checkpoint",
        default="",
        type=str,
        help="Checkpoint name of the pre-trained model within that session",
    )
    parser.add_argument(
        "--just_for_save",
        action="store_true",
        help="save checkpoint and terminate immediately",
    )
    parser.add_argument(
        "--freeze_embedding",
        action="store_true",
        help="freeze the ELECTRA encoder and fine-tune only the QA head",
    )
    parser.add_argument(
        "--mix_qa",
        action="store_true",
        help="mix the QA set to add variance",
    )
    parser.add_argument(
        "--mix_portion",
        type=float,
        default=0.5,
        help="portion of the QA pairs to be reconstructed",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
             "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
             "be truncated to this length.",
    )
    parser.add_argument("--do_pretrain_span", action="store_true", help="Whether to run span-pretraining.")
    parser.add_argument("--do_pretrain_qa", action="store_true", help="Whether to run qa-pretraining.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument("--do_initial_validation", action="store_true", help="Whether to run initial validation")
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
             "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
             "A number of warnings are expected for a normal SQuAD evaluation.",
    )

    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
             "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")

    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################
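    # NSML typically relaunches a submitted/paused session with --pause 1; the
    # nsml.paused(scope=locals()) call below hands control back to the platform
    # in that case, and --mode is set by the platform (train vs. test).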

    args = parser.parse_args()

    # for NSML: resolve the data dir relative to the mounted dataset path
    if args.data_dir:
        args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (
            os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log'
    )
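    # Note: with filename='log.log', logging.basicConfig installs a FileHandler
    # only; add a StreamHandler if console output is also needed.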
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()

    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
    # tokenizer.add_special_tokens({"additional_special_tokens" : ["[QUES]"]})
    # print("vocabsize: {}".format(tokenizer.vocab_size))
    # print("example")
    # print(tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]"))
    model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
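    # Note: the tokenizer and model are pinned to the KoELECTRA checkpoint
    # above, so --model_type and --model_name_or_path are effectively unused
    # in this snippet.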

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    model.to(args.device)  # move to device before amp/DataParallel/DDP wrapping

    if args.fp16:
        try:
            import apex
            from apex import amp

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        # The optimizer is created later, inside train(), so only the model is
        # passed to amp.initialize here.
        model = amp.initialize(model, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 0:
        # Always wrap on GPU so model.module is valid later (e.g. --freeze_embedding)
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )


    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args) 




    # bind_nsml(model, tokenizer, args)

    if args.load_model:
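        # nsml.load may also restore the bound args, so the current
        # command-line args are stashed first and copied back afterwards.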
        tmp_args = parser.parse_args()
        nsml.copy(args, tmp_args)
        nsml.load(checkpoint=args.load_model_checkpoint, session=args.load_model_session)
        nsml.copy(tmp_args, args)
    
    if args.just_for_save:
        nsml.save("test")
        return

    # initial validation
    if args.do_initial_validation:
        logger.info("Initinal Validation start")
        result = evaluate(args, model, tokenizer, prefix="")
        _f1, _exact = result["f1"], result["exact"]

        logger.info("f1_val = {}, exact_val = {}".format(_f1, _exact))
        if IS_ON_NSML:
            nsml.report(summary=True, step=0, f1=_f1, exact=_exact)

    # 'Span' Pretraining
    if args.do_pretrain_span:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=False)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span")

    # 'QA' Pretraining
    if args.do_pretrain_qa:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=True)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span+qa")

    # Training
    if args.do_train:
        if args.freeze_embedding:
            # model may be wrapped in DataParallel/DDP; unwrap before freezing
            encoder = model.module.electra if hasattr(model, "module") else model.electra
            for param in encoder.parameters():
                param.requires_grad = False
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False)
        t = time.time() - t
        logger.info("loading train data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

def load_data(dir_name):
    # NSML cache-loading hook: cached_features_file and features_and_datasets
    # are assumed to live in the enclosing (module-level) scope.
    tmp = torch.load(os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
    print(tmp.keys())
    nsml.copy(tmp, features_and_datasets)
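
A plausible save counterpart for this cache hook (a sketch; it assumes the same module-level cached_features_file and features_and_datasets that load_data references):

def save_data(dir_name):
    # Persist the features/datasets mapping so that load_data can restore it
    # from a cached NSML session via nsml.copy.
    torch.save(features_and_datasets, os.path.join(dir_name, '{}.pt'.format(cached_features_file)))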