def get_arguments():
    # Initialize the parser
    parser = argparse.ArgumentParser(
        description='Input arguments for which dataset to select')

    ## Program Arguments
    parser.add_argument('--seed', type=int,
                        help='Seed value. Default=0', default=0)
    parser.add_argument('--task_name', choices=['qnli', 'sst2', 'mrpc'],
                        help='Which dataset to select. Default=sst2', default='sst2')
    parser.add_argument('--model', choices=['bart', 'xlnet'],
                        help='Which model to select. Default=xlnet', default='xlnet')
    parser.add_argument('--out_dir',
                        help='Output dir where to save the dataset. Default=./out_dir',
                        default='./out_dir')

    ## Parse the arguments
    args = parser.parse_args()
    args.out_dir = eutils.mkdir_p(args.out_dir)
    return args
def compute_dev_s_scores():
    ## TODO: Write the values to the dataframe dynamically instead of at the very end
    key_name = 'dev_s'
    model_names, task_names, ngrams_list, seeds_list = get_settings_list()

    args = get_arguments()
    args.inp_dir = eutils.mkdir_p(f'./out_dir/{key_name}')

    avg_scores = []
    for outer_idx, (task, model, ngram) in enumerate(
            itertools.product(task_names, model_names, ngrams_list)):
        print(f'\nKey_name: {key_name}, Model: {model}, task: {task}, ngram: {ngram}\n')
        args.task_name = task
        args.model = model
        args.ngram = ngram

        tmp_res = []
        for inner_idx, seed in enumerate(seeds_list):
            args.seed = seed  ## Set the seed
            tmp_res.append(get_model_scores(args))

        dict_of_list = merge_with(list, *tmp_res)
        dict_of_avg_val = {key: statistics.mean(val)
                           for key, val in dict_of_list.items()}
        avg_scores.append({'model': model,
                           'task_name': task,
                           'ngram': ngram,
                           'word_order_sensitivity': word_order_sensitivity(dict_of_avg_val['accuracy']),
                           **dict_of_avg_val})

    df = pd.DataFrame(avg_scores)
    file_name = args.out_dir / f'{key_name}_scores.csv'
    df.to_csv(file_name, index=False)
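## `word_order_sensitivity` above is a project helper that is not shown in this section.
## Below is a minimal, hypothetical sketch of one plausible definition, assuming the
## averaged accuracy is on a [0, 1] scale and the tasks are binary with a 0.5 chance
## baseline; the repository's actual helper may normalize differently.
def word_order_sensitivity(accuracy):
    """Hypothetical sketch: fraction of the above-chance margin lost on shuffled inputs."""
    return (1.0 - accuracy) / 0.5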
def main(args):
    ## Debugging Mode
    if sys.gettrace() is not None:
        print('In debug mode')
        args.profiler = 'simple'
        args.num_workers = 0
        args.pin_memory = False
        args.default_root_dir = mkdir_p(abs_path('./tmp_dir'))
        args.stochastic_weight_avg = True
        args.limit_train_batches = 0.001
        args.limit_val_batches = 0.001
        args.num_sanity_val_steps = 0
        args.terminate_on_nan = True
        args.deterministic = True
        args.auto_select_gpus = False
        args.fast_dev_run = False  # Quick Check Working
        args.progress_bar_refresh_rate = 0
        args.gpus = 1
        args.precision = 16
        args.train_batch_size = 32
        # args.val_batch_size = 4
        args.freeze_encoder = True
        args.verbose = True
        args.max_epochs = 2

    ## Model
    dict_vars = vars(args)
    model = BertForRegression(**dict_vars)
    print('Model Init Done')

    ## TODO: Early Stopping Callback
    ## Callbacks
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filename='{epoch:02d}-{val_loss:.5f}-{step_count}',
        dirpath=None,
        prefix="best_model_checkpoint",
        monitor="val_loss",
        mode="min",
        save_top_k=args.save_top_k,
    )
    # lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')

    trainer = pl.Trainer.from_argparse_args(
        args,
        callbacks=[LoggingCallback(), checkpoint_callback],
    )
    trainer.fit(model)

    best_model_path = trainer.checkpoint_callback.best_model_path
    print(f'\nBEST MODEL PATH IS {best_model_path}')
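## `main` above expects an argparse Namespace that already carries both the
## pytorch-lightning Trainer flags and the model/training flags it reads
## (save_top_k, train_batch_size, freeze_encoder, ...). The sketch below shows one way
## such a Namespace could be built; the flag names and defaults are assumptions, while
## pl.Trainer.add_argparse_args is the standard PL 1.x companion to the
## pl.Trainer.from_argparse_args call used above.
def build_training_args():
    """Hypothetical sketch of an argparse setup compatible with main()."""
    parser = argparse.ArgumentParser(description='Fine-tune BertForRegression')
    ## Model / training flags read inside main(); names taken from the code above
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--save_top_k', type=int, default=1)
    parser.add_argument('--freeze_encoder', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    ## Attach the standard Trainer flags (--gpus, --max_epochs, --precision, ...)
    parser = pl.Trainer.add_argparse_args(parser)
    return parser.parse_args()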
def compute_dev_r_scores():
    key_name = 'dev_r'
    model_names, task_names, ngrams_list, seeds_list = get_settings_list()

    args = get_arguments()
    args.inp_dir = eutils.mkdir_p(f'./out_dir/{key_name}')

    avg_scores = []
    for outer_idx, (task, model) in enumerate(itertools.product(task_names, model_names)):
        print(f'\nKey_name: {key_name}, Model: {model}, task: {task}\n')
        args.task_name = task
        args.model = model

        tmp_res = get_model_scores(args)
        avg_scores.append({'model': model, 'task_name': task, **tmp_res})

    df = pd.DataFrame(avg_scores)
    file_name = args.out_dir / f'{key_name}_scores.csv'
    df.to_csv(file_name, index=False)
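## Both compute_dev_s_scores and compute_dev_r_scores draw their sweep from
## `get_settings_list`, which is defined elsewhere in the repository. A minimal,
## hypothetical sketch of its assumed shape is given below; the concrete n-gram sizes
## and seeds are placeholders, not the project's actual sweep.
def get_settings_list():
    """Hypothetical sketch: the four lists the score-aggregation loops expect."""
    model_names = ['bart', 'xlnet']        # matches the --model choices above
    task_names = ['qnli', 'sst2', 'mrpc']  # matches the --task_name choices above
    ngrams_list = [1, 2, 3]                # placeholder n-gram sizes for shuffling
    seeds_list = [0, 1, 2]                 # placeholder seeds averaged over for dev_s
    return model_names, task_names, ngrams_list, seeds_list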
def save_dataset(dataset, args):
    ## Save as CSV file (dev-r)
    out_dir = eutils.mkdir_p(args.out_dir.joinpath('dev_r'))
    file_name = os.path.join(
        out_dir, f'model_{args.model}_task_name_{args.task_name}.csv')
    dataset.to_csv(path_or_buf=file_name, index=False)
def load_dataset_from_csv_with_key(args, data_key):
    if data_key == 'dev_r':
        return load_dataset_from_csv(args)
    elif data_key == 'dev_s':
        args_copy = copy.deepcopy(args)
        args_copy.inp_dir = args_copy.inp_dir / f'seed_{args.seed}' / f'ngram_{args.ngram}'
        return load_dataset_from_csv(args_copy)
    else:
        print('Not yet implemented')
        sys.exit(0)


def get_model_scores(args):
    dataset = load_dataset_from_csv_with_key(args, args.inp_dir.stem)
    eval_results, model_tuple = eutils.get_model_scores(dataset, args)
    return {
        'accuracy': eval_results['eval_accuracy'],
        'avg_confidence_score': eval_results['eval_avg_confidence_score'],
    }


if __name__ == '__main__':
    args = get_arguments()
    if sys.gettrace() is not None:
        args.inp_dir = eutils.mkdir_p('./out_dir/dev_s')
        args.out_dir = args.inp_dir
    aa = get_model_scores(args)
    print('Done')
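## `load_dataset_from_csv` above is the counterpart of the save_dataset helpers in this
## section. A minimal, hypothetical sketch is given below, assuming args.inp_dir is a
## pathlib.Path and the CSVs follow the naming used by save_dataset (dev_s files carry
## extra _seed_..._ngram_... suffixes, hence the glob); the project's real loader may
## instead wrap the frame in a datasets.Dataset.
def load_dataset_from_csv(args):
    """Hypothetical sketch: read back a CSV written by save_dataset."""
    pattern = f'model_{args.model}_task_name_{args.task_name}*.csv'
    file_name = next(args.inp_dir.glob(pattern))  # assumes exactly one matching file
    return pd.read_csv(file_name)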
def save_dataset(dataset, args):
    ## Save as CSV file (dev-s)
    out_dir = eutils.mkdir_p(
        args.out_dir.joinpath(f'dev_s/seed_{args.seed}/ngram_{args.ngram}/'))
    file_name = os.path.join(
        out_dir,
        f'model_{args.model}_task_name_{args.task_name}_seed_{args.seed}_ngram_{args.ngram}.csv')
    dataset.to_csv(path_or_buf=file_name, index=False)