def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    # Prepare examples, load model as encoder
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)

    # Load model...
    if args.bert_load_mode == "state_model_only":
        state_dict = all_state['model']
        bert_as_encoder = BertModel.from_state_dict(
            config_file=args.bert_config_json_path,
            state_dict=state_dict)
    else:
        assert args.bert_load_mode == "from_pretrained"
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)
        bert_as_encoder = BertModel.from_pretrained(
            pretrained_model_name_or_path=args.bert_model,
            cache_dir=cache_dir)
    bert_as_encoder.to(device)

    runner_param = RunnerParameters(
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=None,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
    )

    runner = EmbeddingTaskRunner(
        bert_model=bert_as_encoder,
        optimizer=None,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=runner_param,
    )

    # Run training set encoding...
    print("Run training set encoding ... ")
    train_examples = task.get_train_examples()
    train_dataset = runner.run_encoding(train_examples, verbose=True, mode='train')
    print("saving embeddings ... ")
    torch.save(train_dataset, os.path.join(args.output_dir, "train.dataset"))

    # Run development set encoding...
    eval_examples = task.get_dev_examples()
    eval_dataset = runner.run_encoding(eval_examples, verbose=True, mode='eval')
    print("saving embeddings ... ")
    torch.save(eval_dataset, os.path.join(args.output_dir, "dev.dataset"))

    # Run test set encoding...
    test_examples = task.get_test_examples()
    test_dataset = runner.run_encoding(test_examples, verbose=True, mode='test')
    print("saving embeddings ... ")
    torch.save(test_dataset, os.path.join(args.output_dir, "test.dataset"))

    # HACK for the MNLI mismatched set...
    if args.task_name == 'mnli':
        print("=== Start embedding task for MNLI mismatched ===")
        mm_eval_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
        mm_eval_dataset = runner.run_encoding(mm_eval_examples, verbose=True, mode='eval')
        print("=== Saving eval dataset ===")
        torch.save(mm_eval_dataset, os.path.join(args.output_dir, "mm_dev.dataset"))
        print("=== Saved ===")

        mm_test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
        mm_test_dataset = runner.run_encoding(mm_test_examples, verbose=True, mode='test')
        print("=== Saving tensor dataset ===")
        torch.save(mm_test_dataset, os.path.join(args.output_dir, "mm_test.dataset"))
        print("=== Saved ===")
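# Sketch (not part of the original script): how the encoded splits saved above
# could be reloaded downstream. Assumes run_encoding returns a torch Dataset;
# the path argument would be e.g. os.path.join(args.output_dir, "train.dataset").
def _load_encoded_split_sketch(path, batch_size=32):
    import torch
    from torch.utils.data import DataLoader
    dataset = torch.load(path)  # plain torch.save dump written by main() above
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)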
def prepare_train_data(args):
    print('Preparing net training data...')
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    net_task = get_task(args.task_name, args.dataroot)
    net_examples = net_task.get_train_examples()
    net_label_list = net_task.get_labels()
    net_label_map = {label: i for i, label in enumerate(net_label_list)}

    net_input_ids = []
    net_input_masks = []
    net_segment_ids = []
    net_label_ids = []
    for ex_index, example in enumerate(net_examples):
        net_input_id, net_input_mask, net_segment_id, net_label_id = \
            convert_example_to_feature(example, tokenizer, args.max_seq_length, net_label_map)
        net_input_ids.append(net_input_id)
        net_input_masks.append(net_input_mask)
        net_segment_ids.append(net_segment_id)
        net_label_ids.append(net_label_id)
    net_input_ids = torch.tensor(net_input_ids)
    net_input_masks = torch.tensor(net_input_masks)
    net_segment_ids = torch.tensor(net_segment_ids)
    net_label_ids = torch.tensor(net_label_ids)

    print('Preparing ssh training data...')
    if args.auxiliary_labels == 2:
        ssh_task = get_task('aug-2', args.aug_dataroot)
    elif args.auxiliary_labels == 3:
        ssh_task = get_task('aug-3', args.aug_dataroot)
    else:
        ssh_task = get_task('aug-4', args.aug_dataroot)
    ssh_examples = ssh_task.get_train_examples()
    ssh_label_list = ssh_task.get_labels()
    ssh_label_map = {label: i for i, label in enumerate(ssh_label_list)}

    ssh_input_ids = []
    ssh_input_masks = []
    ssh_segment_ids = []
    ssh_label_ids = []
    for ex_index, example in enumerate(ssh_examples):
        ssh_input_id, ssh_input_mask, ssh_segment_id, ssh_label_id = \
            convert_example_to_feature(example, tokenizer, args.max_seq_length, ssh_label_map)
        ssh_input_ids.append(ssh_input_id)
        ssh_input_masks.append(ssh_input_mask)
        ssh_segment_ids.append(ssh_segment_id)
        ssh_label_ids.append(ssh_label_id)
    # Truncate the auxiliary (ssh) data so both views have the same number of
    # examples and can be zipped into a single TensorDataset.
    ssh_input_ids = torch.tensor(ssh_input_ids[:len(net_input_ids)])
    ssh_input_masks = torch.tensor(ssh_input_masks[:len(net_input_masks)])
    ssh_segment_ids = torch.tensor(ssh_segment_ids[:len(net_segment_ids)])
    ssh_label_ids = torch.tensor(ssh_label_ids[:len(net_label_ids)])

    trset = torch.utils.data.TensorDataset(
        net_input_ids, net_input_masks, net_segment_ids, net_label_ids,
        ssh_input_ids, ssh_input_masks, ssh_segment_ids, ssh_label_ids)
    trset_sampler = torch.utils.data.RandomSampler(trset)
    trloader = torch.utils.data.DataLoader(
        trset, batch_size=args.batch_size, sampler=trset_sampler, num_workers=0)
    return trloader
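# Sketch (hypothetical -- the real convert_example_to_feature is defined
# elsewhere in the repo): the contract assumed by prepare_train_data is that it
# tokenizes example.text_a, truncates to max_seq_length, and returns padded
# (input_ids, input_mask, segment_ids) plus the mapped label id.
# Sentence-pair handling (text_b) is omitted in this sketch.
def _convert_example_to_feature_sketch(example, tokenizer, max_seq_length, label_map):
    tokens = tokenizer.tokenize(example.text_a)[:max_seq_length - 2]  # room for [CLS]/[SEP]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)
    padding = [0] * (max_seq_length - len(input_ids))
    return (input_ids + padding,
            input_mask + padding,
            segment_ids + padding,
            label_map[example.label])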
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    # NOTE: may fail for 'imdb', which is not among the defined tasks
    task = get_task(args.task_name, args.data_dir)

    # Create the tokenizer from the given model arguments.
    # XLNet is assumed to use the same tokenizer-creation interface as BERT.
    tokenizer = shared_model_setup.create_tokenizer(
        xlnet_model_name=args.xlnet_model,  # TODO: confirm create_tokenizer accepts xlnet_* kwargs
        xlnet_load_mode=args.xlnet_load_mode,  # TODO: same here
        do_lower_case=args.do_lower_case,
        xlnet_vocab_path=args.xlnet_vocab_path,  # TODO: unsure how to adapt this for XLNet
    )
    # Usually the pre-trained checkpoint state
    all_state = shared_model_setup.load_overall_state(args.xlnet_load_path, relaxed=True)
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        xlnet_model_name=args.xlnet_model,
        xlnet_load_mode=args.xlnet_load_mode,
        xlnet_load_args=args.xlnet_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        xlnet_config_json_path=args.xlnet_config_json_path,
    )

    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        if args.train_examples_number is not None:
            train_examples = random_sample(train_examples, args.train_examples_number)
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            # NOTE: likely a leftover from the BERT script; this should
            # presumably check args.xlnet_load_mode instead
            state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
        )
    else:
        train_examples = None
        t_total = 0
        optimizer = None

    # TODO: verify the XLNet runner's behavior; it is assumed to mirror the
    # BERT GlueTaskRunner.
    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        ),
    )

    if args.do_train:
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(
                train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(train_dataloader):
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_xlnet(
                            model=model,
                            optimizer=optimizer,
                            args=args,
                            save_path=os.path.join(
                                args.output_dir,
                                f"all_state___epoch{epoch:04d}___batch{step:06d}.p"),
                            # NOTE: bert_save_mode appears to be a leftover
                            # from the BERT version of this script
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_xlnet(
            model=model,
            optimizer=optimizer,
            args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )

    # NOTE: the MNLI-mismatched hack from the BERT script is intentionally
    # removed here.
    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(
            val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False, index=False)
        metrics_str = json.dumps(
            {"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"),
                  header=False, index=False)
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )

    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
        )
    else:
        train_examples = None
        t_total = 0
        optimizer = None

    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total,
            warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            eval_batch_size=args.eval_batch_size,
        ),
    )

    if args.do_train:
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(
                train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(train_dataloader):
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_bert(
                            model=model,
                            optimizer=optimizer,
                            args=args,
                            save_path=os.path.join(
                                args.output_dir,
                                f"all_state___epoch{epoch:04d}___batch{step:06d}.p"),
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_bert(
            model=model,
            optimizer=optimizer,
            args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(
            val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False, index=False)
        metrics_str = json.dumps(
            {"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
            mm_results = runner.run_val(
                mm_val_examples, task_name=task.name, verbose=not args.not_verbose)
            df = pd.DataFrame(mm_results["logits"])  # was results["logits"]: wrong split
            df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"),
                      header=False, index=False)
            combined_metrics = {}
            # .items() added, assuming results["metrics"] is a dict; the
            # original iterated the mapping directly, which unpacks keys only
            for k, v in results["metrics"].items():
                combined_metrics[k] = v
            for k, v in mm_results["metrics"].items():
                combined_metrics["mm-" + k] = v
            combined_metrics_str = json.dumps({
                "loss": results["loss"],
                "metrics": combined_metrics,
            }, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
                f.write(combined_metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"),
                  header=False, index=False)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
            logits = runner.run_test(test_examples)
            df = pd.DataFrame(logits)
            df.to_csv(os.path.join(args.output_dir, "mm_test_preds.csv"),
                      header=False, index=False)
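# Sketch (not part of the original script): mapping a saved prediction CSV back
# to label strings, e.g. for a GLUE submission. Assumes the logit column order
# matches task.get_labels(); the csv_path and label_list are the caller's to supply.
def _preds_csv_to_labels_sketch(csv_path, label_list):
    import numpy as np
    import pandas as pd
    logits = pd.read_csv(csv_path, header=None).values
    return [label_list[i] for i in np.argmax(logits, axis=1)]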
        eval_dataloader = DataLoader(
            eval_data,
            sampler=eval_sampler,
            batch_size=self.rparams.eval_batch_size,
        )
        return HybridLoaderSeparated(eval_dataloader, eval_tokens_a, eval_tokens_b)


if __name__ == "__main__":
    from glue.tasks import get_task
    from shared import model_setup as shared_model_setup
    from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
    from pytorch_pretrained_bert.modeling import BertModel

    task = get_task("wnli", "../../jiant_data/WNLI")
    train_examples = task.get_train_examples()
    label_map = {label: i for i, label in enumerate(task.get_labels())}
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name="bert-base-uncased",
        bert_load_mode="from_pretrained",
        do_lower_case=True,
    )
    # NOTE: unused in this smoke test
    bert_vocab_path = "../cache/bert_metadata/uncased_L-12_H-768_A-12/vocab.txt"
    train_features = convert_examples_to_features_separated(
        train_examples,
        label_map=label_map,
        max_seq_length=100,
        tokenizer=tokenizer,
        verbose=True)
    train_data, train_tokens_a, train_tokens_b = convert_to_dataset_separated(
        train_features)  # argument assumed; the original snippet was truncated here
parser.add_argument('--alpha', default=0.1, type=float)
parser.add_argument('--num_aug', default=1, type=int)
parser.add_argument('--num_type', default=4, type=int)
parser.add_argument('--task_name', default='CoLA')
parser.add_argument('--dataroot', default='./glue_data/')
parser.add_argument('--aug_dataroot', default='./aug_data/')
args = parser.parse_args()

alpha = args.alpha
num_aug = args.num_aug
num_type = args.num_type
task_name = args.task_name

task_dir = os.path.join(args.dataroot, task_name)
task = get_task(task_name.lower(), task_dir)
output_dir = os.path.join(args.aug_dataroot, task_name)
try:
    os.makedirs(output_dir)
except OSError:
    pass

ori_train_df = task.get_train_df()
ori_dev_df = task.get_dev_df()
aug_train_df = pd.DataFrame(columns=["sentence", "label"])

print("Training dataset preview:")
print("train sentences num:", len(ori_train_df))
print("Original:", ori_train_df.head())
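# Hypothetical continuation (not shown in this snippet): once aug_train_df has
# been filled with augmented sentences, it would presumably be written under
# output_dir in GLUE's TSV layout, e.g.:
#
#   aug_train_df.to_csv(os.path.join(output_dir, "train.tsv"),
#                       sep="\t", index=False)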