def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    # Prepare examples, load model as encoder
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)

    # Load model ...
    if args.bert_load_mode == "state_model_only":
        state_dict = all_state['model']
        bert_as_encoder = BertModel.from_state_dict(
            config_file=args.bert_config_json_path,
            state_dict=state_dict)
    else:
        assert args.bert_load_mode == "from_pretrained"
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)
        bert_as_encoder = BertModel.from_pretrained(
            pretrained_model_name_or_path=args.bert_model,
            cache_dir=cache_dir)

    bert_as_encoder.to(device)

    runner_param = RunnerParameters(
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=None,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
    )

    runner = EmbeddingTaskRunner(
        bert_model=bert_as_encoder,
        optimizer=None,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=runner_param,
    )

    # Run training set encoding ...
    print("Run training set encoding ... ")
    train_examples = task.get_train_examples()
    train_dataset = runner.run_encoding(train_examples, verbose=True, mode='train')
    print("saving embeddings ... ")
    torch.save(train_dataset, os.path.join(args.output_dir, "train.dataset"))

    # Run development set encoding ...
    eval_examples = task.get_dev_examples()
    eval_dataset = runner.run_encoding(eval_examples, verbose=True, mode='eval')
    print("saving embeddings ... ")
    torch.save(eval_dataset, os.path.join(args.output_dir, 'dev.dataset'))

    # Run test set encoding ...
    test_examples = task.get_test_examples()
    test_dataset = runner.run_encoding(test_examples, verbose=True, mode='test')
    print("saving embeddings ... ")
    torch.save(test_dataset, os.path.join(args.output_dir, "test.dataset"))

    # HACK for the MNLI mis-matched set ...
    if args.task_name == 'mnli':
        print("=== Start embedding task for MNLI mis-matched ===")
        mm_eval_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
        mm_eval_dataset = runner.run_encoding(mm_eval_examples, verbose=True, mode='eval')
        print("=== Saving eval dataset ===")
        torch.save(mm_eval_dataset, os.path.join(args.output_dir, "mm_dev.dataset"))
        print("=== Saved ===")

        mm_test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
        mm_test_dataset = runner.run_encoding(mm_test_examples, verbose=True, mode='test')
        print("=== Saving tensor dataset ===")
        torch.save(mm_test_dataset, os.path.join(args.output_dir, "mm_test.dataset"))
        print("=== Saved ===")
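
# Hedged sketch (not part of the original script): reloading one of the datasets written
# by the torch.save calls above. Only the "<split>.dataset" file names come from the code
# above; the batch_size default and the output_dir argument are illustrative placeholders.
def load_saved_embeddings(output_dir, split="train", batch_size=32):
    """Reload a dataset saved by run_encoding/torch.save and wrap it in a DataLoader."""
    import os                                   # local imports so this sketch stands alone
    import torch
    from torch.utils.data import DataLoader

    dataset = torch.load(os.path.join(output_dir, "{}.dataset".format(split)))
    # Shuffle only the training split; evaluation splits keep their original order.
    return DataLoader(dataset, batch_size=batch_size, shuffle=(split == "train"))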
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)

    # Could cause a problem: imdb is not among the defined tasks.
    task = get_task(args.task_name, args.data_dir)

    # Create the tokenizer from the given model inputs.
    # XLNet presumably uses the same tokenizer-creation path.
    tokenizer = shared_model_setup.create_tokenizer(
        xlnet_model_name=args.xlnet_model,        # need to change
        xlnet_load_mode=args.xlnet_load_mode,     # need to change
        do_lower_case=args.do_lower_case,
        xlnet_vocab_path=args.xlnet_vocab_path,   # not sure how to modify
    )
    all_state = shared_model_setup.load_overall_state(
        args.xlnet_load_path, relaxed=True)  # probably the pre-trained one
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        xlnet_model_name=args.xlnet_model,
        xlnet_load_mode=args.xlnet_load_mode,
        xlnet_load_args=args.xlnet_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        xlnet_config_json_path=args.xlnet_config_json_path,
    )
    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        if args.train_examples_number is not None:
            train_examples = random_sample(train_examples, args.train_examples_number)
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            # NOTE: bert_load_mode / bert_save_mode below are leftovers from the BERT
            # version of this script; they likely need to change for XLNet as well.
            state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
        )
    else:
        train_examples = None
        t_total = 0
        optimizer = None

    # TODO: what does the XLNet runner do? (presumably the same flow as the BERT GlueTaskRunner)
    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        ))

    if args.do_train:
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(
                train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(train_dataloader):
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_xlnet(
                            model=model,
                            optimizer=optimizer,
                            args=args,
                            save_path=os.path.join(
                                args.output_dir,
                                f"all_state___epoch{epoch:04d}___batch{step:06d}.p"),
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_xlnet(
            model=model,
            optimizer=optimizer,
            args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )

    # The MNLI mis-matched hack has been removed here.
    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"), header=False, index=False)
        metrics_str = json.dumps(
            {"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"), header=False, index=False)
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)

    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)
    model = lm_model_setup.create_model(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=all_state,
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )
    if args.print_trainable_params:
        log_info.print_trainable_params(model)

    train_dataset = lm_runners.LMDataset(
        args.train_file,
        tokenizer,
        seq_len=args.max_seq_length,
        corpus_lines=None,
        on_memory=args.on_memory,
    )
    t_total = shared_model_setup.get_opt_train_steps(
        num_train_examples=len(train_dataset),
        args=args,
    )
    optimizer = shared_model_setup.create_optimizer(
        model=model,
        learning_rate=args.learning_rate,
        t_total=t_total,
        loss_scale=args.loss_scale,
        fp16=args.fp16,
        warmup_proportion=args.warmup_proportion,
        state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
    )
    runner = lm_runners.LMRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        device=device,
        rparams=lm_runners.RunnerParameters(
            select_prob=args.select_prob,
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
        )
    )
    runner.run_train(train_dataset)
    lm_model_setup.save_bert(
        model=model,
        optimizer=optimizer,
        args=args,
        save_path=os.path.join(args.output_dir, "all_state.p"),
        save_mode=args.bert_save_mode,
    )
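
# Hedged sketch of how get_opt_train_steps presumably derives t_total from the same args
# used above (train_batch_size, gradient_accumulation_steps, num_train_epochs). This is the
# standard fine-tuning formula, not necessarily the exact implementation in shared_model_setup.
def estimate_total_optimization_steps(num_train_examples, args):
    import math  # local import so this sketch stands alone
    steps_per_epoch = math.ceil(
        num_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    return int(steps_per_epoch * args.num_train_epochs)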
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )
    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
        )
    else:
        train_examples = None
        t_total = 0
        optimizer = None

    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        )
    )

    if args.do_train:
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(train_dataloader):
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_bert(
                            model=model,
                            optimizer=optimizer,
                            args=args,
                            save_path=os.path.join(
                                args.output_dir,
                                f"all_state___epoch{epoch:04d}___batch{step:06d}.p"),
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_bert(
            model=model,
            optimizer=optimizer,
            args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"), header=False, index=False)
        metrics_str = json.dumps({"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
            mm_results = runner.run_val(mm_val_examples, task_name=task.name, verbose=not args.not_verbose)
            df = pd.DataFrame(mm_results["logits"])  # use the mis-matched logits (was results["logits"])
            df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"), header=False, index=False)
            combined_metrics = {}
            # .items() added, assuming "metrics" is a dict of metric name -> value
            for k, v in results["metrics"].items():
                combined_metrics[k] = v
            for k, v in mm_results["metrics"].items():
                combined_metrics["mm-" + k] = v
            combined_metrics_str = json.dumps({
                "loss": results["loss"],
                "metrics": combined_metrics,
            }, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
                f.write(combined_metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"), header=False, index=False)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
            logits = runner.run_test(test_examples)
            df = pd.DataFrame(logits)
            df.to_csv(os.path.join(args.output_dir, "mm_test_preds.csv"), header=False, index=False)
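
# Hedged sketch (not in the original script): turning a saved *_preds.csv of logits back
# into label predictions for classification tasks. The label order comes from
# task.get_labels(), matching how the runner is constructed above; the CSV path is a
# placeholder, and regression tasks (e.g. STS-B) would skip the argmax step.
def logits_csv_to_labels(preds_csv_path, label_list):
    """Map each row of logits to the label with the highest score."""
    import pandas as pd  # local import so this sketch stands alone
    logits_df = pd.read_csv(preds_csv_path, header=None)
    predicted_indices = logits_df.values.argmax(axis=1)
    return [label_list[i] for i in predicted_indices]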
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    tokenizer = AutoTokenizer.from_pretrained(args.bert_all_dir)
    classification_lm_model = ssl_reg_model_setup.MyBertClassificationLM(
        bert_load_path=args.bert_all_dir,
        num_labels=len(task.processor.get_labels()))

    if args.do_train:
        if args.print_trainable_params:
            print("TRAINABLE PARAMS:")
            print(" SHARED:")
            for param_name, param in classification_lm_model.classification_model.roberta.named_parameters():
                if param.requires_grad:
                    print(" {} {}".format(param_name, tuple(param.shape)))
            print(" CLASSIFICATION:")
            for param_name, param in classification_lm_model.classification_model.named_parameters():
                if param.requires_grad and not param_name.startswith("roberta."):
                    print(" {} {}".format(param_name, tuple(param.shape)))
            print(" LM:")
            for param_name, param in classification_lm_model.lm_model.named_parameters():
                if param.requires_grad and not param_name.startswith("roberta."):
                    print(" {} {}".format(param_name, tuple(param.shape)))

        train_examples = task.get_train_examples()
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        parameters = list(classification_lm_model.classification_model.named_parameters()) \
            + list(classification_lm_model.lm_model.lm_head.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        # NOTE: both groups use weight_decay=0.0, so the no_decay split currently has no
        # effect; the per-group values override AdamW's weight_decay=0.1 below.
        optimizer_parameters = [
            {
                "params": [p for n, p in parameters if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {
                "params": [p for n, p in parameters if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_parameters,
                          lr=args.learning_rate,
                          betas=(args.adam_beta1, args.adam_beta2),
                          eps=1e-6,
                          weight_decay=0.1)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion * t_total,
            num_training_steps=t_total)
    else:
        train_examples = None
        t_total = 0
        optimizer = None
        scheduler = None  # avoid a NameError below when do_train is not set

    runner = ClassificationLMTaskRunner(
        classification_lm_model=classification_lm_model,
        optimizer=optimizer,
        clip_grad_norm=args.clip_grad_norm,
        scheduler=scheduler,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            classification_loss_weight=args.classification_loss_weight,
            train_lm_loss_weight=args.train_lm_loss_weight,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        ),
        output_path=args.output_dir)

    if args.do_train:
        if args.do_val_history:
            # For GLUE datasets we do not have test-set labels; test predictions can only
            # be evaluated by submitting them to the GLUE server.
            if not args.has_test_label:
                val_examples = task.get_dev_examples()
                results = runner.run_train_val(
                    train_examples=train_examples,
                    val_examples=val_examples,
                    task_name=task.name,
                )
                metrics_str = json.dumps(results, indent=2)
                with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                    f.write(metrics_str)
            else:
                val_examples = task.get_dev_examples()
                test_examples = task.get_test_examples()
                results_val, results_test = runner.run_train_val_test(
                    train_examples=train_examples,
                    val_examples=val_examples,
                    test_examples=test_examples,
                    task_name=task.name,
                    save_best_model=args.save_best_model,
                )
                metrics_str = json.dumps(results_val, indent=2)
                with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                    f.write(metrics_str)
                metrics_str = json.dumps(results_test, indent=2)
                with open(os.path.join(args.output_dir, "test_metrics_history.json"), "w") as f:
                    f.write(metrics_str)
        else:
            runner.run_train(train_examples, task_name=task.name)

    if args.do_save:
        if not args.save_best_model:
            # Save the trained model from the last epoch.
            ssl_reg_model_setup.save_bert(
                classification_lm_model=classification_lm_model,
                optimizer=optimizer,
                args=args,
                save_path=os.path.join(args.output_dir, "all_state.p"),
                save_mode=args.bert_save_mode,
            )

    if args.do_val:
        val_examples = task.get_dev_examples()
        runner.load_best_model(os.path.join(args.output_dir, "all_state.p"))
        results = runner.run_evaluate_with_label(val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"), header=False, index=False)
        metrics_str = json.dumps({"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        runner.load_best_model(os.path.join(args.output_dir, "all_state.p"))
        results = runner.run_evaluate_with_label(test_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"), header=False, index=False)
        metrics_str = json.dumps({"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "test_metrics.json"), "w") as f:
            f.write(metrics_str)
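
# Hedged alternative (not the author's code): the conventional way to use the no_decay
# split above, applying weight decay only to non-bias/non-LayerNorm parameters. The 0.1
# value mirrors the AdamW(weight_decay=0.1) call; whether the ssl_reg setup intends any
# decay at all is not clear from this file.
def build_decay_param_groups(named_params, weight_decay=0.1):
    named_params = list(named_params)  # allow generators from named_parameters()
    no_decay = ["bias", "LayerNorm.weight"]
    return [
        {"params": [p for n, p in named_params if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in named_params if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]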