def gen_training_commands(spec_dict):

    paths.validate_spec_dict(spec_dict, config.spec_dict_params)
    paths.validate_phase(spec_dict['task_phase'], config.task_phases)

    mem_alloc_gb, time_alloc_hrs, n_tasks, cpus_per_task = scripts.get_training_alloc(
        spec_dict['training_dataset'])

    header_commands = scripts.gen_command_header(
        mem_alloc_gb=mem_alloc_gb,
        time_alloc_hrs=time_alloc_hrs,
        n_tasks=n_tasks,
        cpus_per_task=cpus_per_task,
        two_gpus=(spec_dict['training_dataset'] in {'Providence', 'Providence-Age'}))
    slurm_commands = header_commands

    # Get the directory where the trained model should be saved.
    model_output_spec_dict = copy.copy(spec_dict)
    model_output_spec_dict['task_phase'] = 'train'
    model_output_dir = paths.get_directory(model_output_spec_dict)

    if not os.path.exists(model_output_dir):
        os.makedirs(model_output_dir)

    # Clear the directory in case it holds results from an earlier run; -f keeps rm from
    # failing if the directory does not exist yet. Then recreate it.
    slurm_commands += [f"rm -rf {model_output_dir}\n"]
    slurm_commands += [f"mkdir -p {model_output_dir}\n"]
    slurm_commands += ["mkdir ~/.cache/$SLURM_JOB_ID\n"]

    # Get the directory holding the extracted training data.
    data_input_spec_dict = copy.copy(spec_dict)
    data_input_spec_dict['task_phase'] = 'extract_data'
    data_input_dir = paths.get_directory(data_input_spec_dict)

    sh_loc = 'output/SLURM/' + spec_dict['task_name'] + '_' + spec_dict['task_phase']

    if not os.path.exists(sh_loc):
        os.makedirs(sh_loc)

    # Construct the python/training-related commands.
    slurm_commands.append(
        scripts.get_run_mlm_command(spec_dict['training_split'],
                                    spec_dict['training_dataset'],
                                    spec_dict['use_tags'], data_input_dir,
                                    model_output_dir, config.slurm_user))

    slurm_filename = os.path.join(sh_loc, paths.get_slurm_script_name(spec_dict))
    print(slurm_filename)

    return slurm_filename, slurm_commands
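
# A minimal usage sketch (not called anywhere in the pipeline): shows how
# gen_training_commands might be driven to write a SLURM training script to disk. The
# spec values below are illustrative assumptions that mirror spec dicts used elsewhere in
# this module; the authoritative set of valid values lives in config.spec_dict_params.
def _example_write_training_script():
    example_spec = {
        'task_name': 'child',
        'task_phase': 'train',
        'training_split': 'Providence',
        'training_dataset': 'all',
        'test_split': None,
        'test_dataset': None,
        'model_type': 'BERT',
        'use_tags': True,
        'context_width': None,
        'n_samples': config.n_across_time,
    }
    slurm_filename, slurm_commands = gen_training_commands(example_spec)
    # slurm_commands is a list of strings (most already newline-terminated).
    with open(slurm_filename, 'w') as f:
        f.write(''.join(slurm_commands))
    return slurm_filename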
def gen_fitting_commands(fitting_spec_dict):

    paths.validate_spec_dict(fitting_spec_dict, config.spec_dict_params)
    paths.validate_phase(fitting_spec_dict['task_phase'], config.task_phases)

    mem_alloc_gb, time_alloc_hrs, n_tasks, cpus_per_task = scripts.get_training_alloc(
        fitting_spec_dict['training_dataset'])

    header_commands = scripts.gen_command_header(mem_alloc_gb=mem_alloc_gb,
                                                 time_alloc_hrs=time_alloc_hrs,
                                                 n_tasks=n_tasks,
                                                 cpus_per_task=cpus_per_task,
                                                 two_gpus=False)
    slurm_commands = header_commands

    fitting_output_path = paths.get_directory(fitting_spec_dict)

    if not exists(fitting_output_path):
        os.makedirs(fitting_output_path)

    # Clear the directory in case it holds results from an earlier run, then recreate it.
    slurm_commands += [f"rm -rf {fitting_output_path}\n"]
    slurm_commands += [f"mkdir -p {fitting_output_path}\n"]
    slurm_commands += ["mkdir ~/.cache/$SLURM_JOB_ID\n"]

    # Resolve the directory of the trained model that this fitting run will load.
    model_input_spec_dict = copy.copy(fitting_spec_dict)
    model_input_spec_dict['task_phase'] = 'train'
    model_input_spec_dict['context_width'] = None
    model_input_spec_dict['test_split'] = None
    model_input_spec_dict['test_dataset'] = None
    model_input_dir = paths.get_directory(model_input_spec_dict)

    sh_loc = 'output/SLURM/' + fitting_spec_dict['task_name'] + '_' + fitting_spec_dict['task_phase']

    if not exists(sh_loc):
        os.makedirs(sh_loc)

    sing_header = scripts.gen_singularity_header()

    slurm_commands += [
        f"{sing_header} {scripts.get_python_run_command('src/run/run_beta_search.py', fitting_spec_dict)}\n"
    ]

    slurm_filename = os.path.join(sh_loc, paths.get_slurm_script_name(fitting_spec_dict))

    return slurm_filename, slurm_commands
def get_optimal_hyperparameter_value(this_model_dict, hyperparameter):
    '''
    Get the best hyperparameter value for a given model x test dataset.
    '''

    fitted_model_dict = copy.copy(this_model_dict)
    fitted_model_dict['task_phase'] = 'fit'
    fitted_model_path = paths.get_directory(fitted_model_dict)

    if hyperparameter == 'beta':
        n_hyperparameter = config.n_beta
    elif hyperparameter in ['lambda', 'gamma']:
        n_hyperparameter = config.n_lambda
    else:
        raise ValueError(f'Unrecognized hyperparameter: {hyperparameter}')

    this_hyperparameter_results = pd.read_csv(
        join(fitted_model_path,
             hyperparameter + f'_search_results_{n_hyperparameter}.csv'))

    # The best hyperparameter value is the one that minimizes mean posterior surprisal
    # (equivalently, maximizes posterior probability).
    list_hyperparameter_results = list(
        this_hyperparameter_results[hyperparameter + '_value'])
    list_surp = list(this_hyperparameter_results['posterior_surprisal'])
    argmin_hyperparameter = np.argmin(list_surp)
    best_hyperparameter = list_hyperparameter_results[argmin_hyperparameter]

    return best_hyperparameter
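
# A minimal usage sketch (not called anywhere in the pipeline): reads back the best beta
# for one model x test-dataset pairing after a fit has been run. The spec values are
# illustrative assumptions; the function only needs a dict that paths.get_directory can
# resolve once task_phase is switched to 'fit'.
def _example_lookup_best_beta():
    example_model_dict = {
        'task_name': 'analysis',
        'task_phase': 'eval',
        'training_split': 'Providence',
        'training_dataset': 'all',
        'test_split': 'Providence',
        'test_dataset': 'all',
        'model_type': 'BERT',
        'use_tags': True,
        'context_width': 20,
        'n_samples': config.n_across_time,
    }
    best_beta = get_optimal_hyperparameter_value(example_model_dict, 'beta')
    print('Best beta:', best_beta)
    return best_beta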
def optimize_beta_and_lambda(fitting_dict):
    '''
    Find the values of beta and lambda which minimize posterior surprisal; save this
    information in a place that run_models_across_time can load.

    Args:
        fitting_dict: a dictionary with keys for training_split, training_dataset,
            test_split, test_dataset, etc. See utils/paths.py for a full description.

    Return: the best parameter values for the WFST and Levenshtein distance likelihoods;
        saves the scores for each hyperparameter value as a side effect.
    '''

    beta_sample = hyperparameter_utils.get_hyperparameter_search_values('beta')
    lambda_sample = hyperparameter_utils.get_hyperparameter_search_values('lambda')
    # gamma reuses the lambda search grid.
    gamma_sample = hyperparameter_utils.get_hyperparameter_search_values('lambda')

    # initial_vocab determines the softmax mask used by BERT; leave it as the mask for
    # all evaluations and training.
    initial_vocab, cmu_in_initial_vocab, cmu_indices_for_initial_vocab = load_models.get_initial_vocab_info()

    fitting_path = paths.get_directory(fitting_dict)

    if not exists(fitting_path):
        os.makedirs(fitting_path)

    success_utts_sample_path = paths.get_sample_csv_path(
        task_phase_to_sample_for='fit',
        split=fitting_dict['test_split'],
        dataset=fitting_dict['test_dataset'],
        data_type='success',
        age=None,
        n=config.n_beta)
    success_utts_sample = pd.read_csv(success_utts_sample_path).utterance_id

    # Don't use failures for the hyperparameter search.
    hyperparam_search_results = sample_across_models.sample_across_models(
        success_utts_sample, [], fitting_dict, beta_sample, lambda_sample, gamma_sample,
        child_name=fitting_dict['training_dataset'])

    this_raw_beta_results = hyperparam_search_results.loc[
        hyperparam_search_results.likelihood_type == 'levdist']
    this_raw_lambda_results = hyperparam_search_results.loc[
        hyperparam_search_results.likelihood_type == 'wfst']
    this_raw_gamma_results = hyperparam_search_results.loc[
        hyperparam_search_results.likelihood_type == 'wfst-child']

    # Log the beta results.
    this_beta_results_surp = this_raw_beta_results.groupby(
        ['beta_value']).posterior_probability.agg(
            lambda x: np.mean(-1 * np.log(x))).reset_index()
    this_beta_results_surp = this_beta_results_surp.rename(
        columns={'posterior_probability': 'posterior_surprisal'})
    beta_results_path = join(fitting_path, f'beta_search_results_{config.n_beta}.csv')
    this_beta_results_surp.to_csv(beta_results_path)
    print("Writing beta results to", beta_results_path)
    # plot_hyperparameter_optimization(fitting_path, 'beta', beta_sample, this_beta_results_surp['posterior_surprisal'], split_name, dataset_name)

    # Log the lambda results.
    this_lambda_results_surp = this_raw_lambda_results.groupby(
        ['lambda_value']).posterior_probability.agg(
            lambda x: np.mean(-1 * np.log(x))).reset_index()
    this_lambda_results_surp = this_lambda_results_surp.rename(
        columns={'posterior_probability': 'posterior_surprisal'})
    lambda_results_path = join(fitting_path, f'lambda_search_results_{config.n_beta}.csv')
    this_lambda_results_surp.to_csv(lambda_results_path)
    print("Writing lambda results to", lambda_results_path)
    # plot_hyperparameter_optimization(fitting_path, 'lambda', lambda_sample, this_lambda_results_surp['posterior_surprisal'], split_name, dataset_name)

    # Log the gamma results if necessary.
    this_gamma_results_surp = this_raw_gamma_results.groupby(
        ['gamma_value']).posterior_probability.agg(
            lambda x: np.mean(-1 * np.log(x))).reset_index()
    this_gamma_results_surp = this_gamma_results_surp.rename(
        columns={'posterior_probability': 'posterior_surprisal'})
    gamma_results_path = join(fitting_path, f'gamma_search_results_{config.n_beta}.csv')
    this_gamma_results_surp.to_csv(gamma_results_path)
    print("Writing gamma results to", gamma_results_path)

    return (this_raw_beta_results, this_beta_results_surp, this_raw_lambda_results,
            this_lambda_results_surp, this_raw_gamma_results, this_gamma_results_surp)
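
# A minimal usage sketch (not called anywhere in the pipeline): runs the hyperparameter
# search for one fitting spec, then reads back the optimal values with
# get_optimal_hyperparameter_value. The fitting_dict values are illustrative assumptions.
# Note that reading back lambda assumes the filename suffix it looks for (config.n_lambda)
# matches the suffix used when the search results were written above (config.n_beta).
def _example_fit_and_read_back():
    example_fitting_dict = {
        'task_name': 'analysis',
        'task_phase': 'fit',
        'training_split': 'Providence',
        'training_dataset': 'all',
        'test_split': 'Providence',
        'test_dataset': 'all',
        'model_type': 'BERT',
        'use_tags': True,
        'context_width': 20,
        'n_samples': config.n_across_time,
    }
    optimize_beta_and_lambda(example_fitting_dict)  # writes *_search_results_*.csv files
    best_beta = get_optimal_hyperparameter_value(example_fitting_dict, 'beta')
    best_lambda = get_optimal_hyperparameter_value(example_fitting_dict, 'lambda')
    return best_beta, best_lambda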
def assemble_scores_no_order(hyperparameter_set):
    """
    Load all of the non-child models for a given hyperparameter set.
    """

    model_args = (load_models.gen_finetune_model_args() +
                  load_models.gen_shelf_model_args() +
                  load_models.gen_unigram_args())

    score_store = []

    for model_arg in model_args:

        model_arg['task_name'] = 'analysis'
        model_arg['task_phase'] = 'eval'
        model_arg['test_split'] = 'Providence'
        model_arg['test_dataset'] = 'all'
        model_arg['n_samples'] = config.n_across_time

        # Load results from the eval directory for this model.
        results_path = paths.get_directory(model_arg)

        search_string = join(results_path,
                             hyperparameter_set + '_run_models_across_time_*.pkl')
        print('Searching ' + search_string)
        age_paths = glob.glob(search_string)

        for this_data_path in age_paths:

            with open(this_data_path, "rb") as fh:
                data_df = pickle.load(fh)

            data_df['training_split'] = model_arg['training_split']
            data_df['training_dataset'] = model_arg['training_dataset']
            data_df['test_split'] = model_arg['test_split']
            data_df['test_dataset'] = model_arg['test_dataset']
            data_df['model_type'] = model_arg['model_type']
            data_df['split'] = data_df.training_split + '_' + data_df.training_dataset
            data_df['model'] = paths.get_file_identifier(model_arg)

            score_store.append(data_df)

    return score_store
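
# A minimal usage sketch (not called anywhere in the pipeline): pools the per-age score
# frames returned by assemble_scores_no_order into a single DataFrame. Passing 'beta' as
# the hyperparameter set label is an assumption; the valid labels are whatever prefixes
# run_models_across_time used when pickling its outputs.
def _example_pool_scores(hyperparameter_set='beta'):
    score_frames = assemble_scores_no_order(hyperparameter_set)
    if len(score_frames) == 0:
        return pd.DataFrame()
    return pd.concat(score_frames)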
def get_model_from_split(model_dict):

    model_path = paths.get_directory(model_dict)

    word_info_all = get_cmu_dict_info()
    word_info = word_info_all.word

    try:
        model = BertForMaskedLM.from_pretrained(model_path)
    except Exception as e:
        print('Model loading failed. Does a model actually exist at ' + model_path + '?')
        print(e)
        raise ValueError('Terminating!')

    model.eval()

    tokenizer = BertTokenizer.from_pretrained(model_path)
    softmax_mask, vocab = transformers_bert_completions.get_softmax_mask(tokenizer, word_info)

    return {
        'modelLM': model,
        'tokenizer': tokenizer,
        'softmax_mask': softmax_mask,
        'use_speaker_labels': model_dict['use_tags'],
    }
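
# A minimal usage sketch (not called anywhere in the pipeline): loads a fine-tuned
# checkpoint and unpacks the returned bundle. The spec values are illustrative
# assumptions; the function only requires that paths.get_directory resolves the dict to a
# directory containing a saved Hugging Face BERT model.
def _example_load_model_bundle():
    example_model_dict = {
        'task_name': 'child',
        'task_phase': 'train',
        'training_split': 'Providence',
        'training_dataset': 'all',
        'test_split': None,
        'test_dataset': None,
        'model_type': 'BERT',
        'use_tags': True,
        'context_width': None,
        'n_samples': config.n_across_time,
    }
    bundle = get_model_from_split(example_model_dict)
    # bundle['modelLM'] is a BertForMaskedLM in eval mode; bundle['tokenizer'] is the
    # matching BertTokenizer; bundle['softmax_mask'] restricts scoring to the CMU vocab.
    return bundle['modelLM'], bundle['tokenizer'], bundle['softmax_mask']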
def get_run_mlm_command(training_split, training_dataset, use_tags, data_input_dir,
                        model_output_dir, slurm_user):

    this_args_dict = config.child_args if training_split == 'Providence-Child' else config.general_training_args

    if training_split == 'Providence-Child':
        # Fine-tune from the best non-child model; child runs validate on val.txt.
        base_model_spec = {
            'task_name': 'child',
            'task_phase': 'train',
            'training_split': 'Providence',
            'training_dataset': 'all',
            'test_split': None,
            'test_dataset': None,
            'model_type': 'BERT',
            'use_tags': True,
            'context_width': None,
            'n_samples': config.n_across_time
        }
        base_model_path = paths.get_directory(base_model_spec)
        validation_filename = 'val.txt'
    else:
        # Non-child runs start from the off-the-shelf checkpoint and validate on eval.txt.
        # (The original code keyed the filename choice off base_model_spec['task_name'],
        # which is undefined on this branch, so the check is keyed off training_split.)
        base_model_path = 'bert-base-uncased'
        validation_filename = 'eval.txt'

    this_args_dict['model_name_or_path'] = base_model_path
    this_args_list = sorted(list(this_args_dict.keys()))  # readability

    data_args = [
        f"--train_file {data_input_dir}/train.txt",
        f"--validation_file {data_input_dir}/{validation_filename}",
        f"--cache_dir ~/.cache/$SLURM_JOB_ID",
        f"--output_dir {model_output_dir}",
    ]

    trainer_args = [f"--{key} {this_args_dict[key]}" for key in this_args_list]

    if config.dev_mode:
        trainer_args += [
            f"--max_train_samples 10",
            f"--max_eval_samples 10",
        ]

    main_command = f"singularity exec --nv -B /om,/om2/user/{slurm_user} /om2/user/{slurm_user}/vagrant/ubuntu20.simg"
    this_python_command = f' python3 src/run/run_mlm.py {" ".join(data_args + trainer_args)}'

    return f"{main_command}{this_python_command}"
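
# A minimal usage sketch (not called anywhere in the pipeline): builds and prints the
# singularity + run_mlm.py command for a non-child training run. The data and model
# directories are placeholders; in the pipeline they come from paths.get_directory, as in
# gen_training_commands above.
def _example_print_mlm_command():
    cmd = get_run_mlm_command(
        training_split='Providence',
        training_dataset='all',
        use_tags=True,
        data_input_dir='path/to/extract_data_dir',    # placeholder
        model_output_dir='path/to/model_output_dir',  # placeholder
        slurm_user=config.slurm_user)
    print(cmd)
    return cmd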
def assemble_child_scores_no_order(hyperparameter_set):
    """
    Load all of the child models for a given hyperparameter set.
    """

    task_name = 'analysis'
    task_phase = 'eval'

    child_names = load_splits.get_child_names()

    # Cross each child model with the Providence test data for every child.
    child_arg_list = []
    for training_child in child_names:
        for test_child in child_names:
            child_arg_list.append({
                'training_split': 'Providence-Child',
                'training_dataset': training_child,
                'test_split': 'Providence-Child',
                'test_dataset': test_child,
                'model_type': 'BERT',
                'use_tags': True,
                'context_width': 20,
                'task_name': task_name,
                'n_samples': config.n_across_time,
                'task_phase': task_phase
            })

    # Treat Switchboard as if it were a child and cross it with the Providence test data
    # for every child.
    for test_child in child_names:
        child_arg_list.append({
            'training_split': 'Switchboard',
            'training_dataset': 'all',
            'test_split': 'Providence-Child',
            'test_dataset': test_child,
            'model_type': 'BERT',
            'use_tags': False,
            'context_width': 20,
            'task_name': task_name,
            'n_samples': config.n_across_time,
            'task_phase': task_phase
        })

    # Likewise, cross the Providence all-data model with the Providence test data for
    # every child.
    for test_child in child_names:
        child_arg_list.append({
            'training_split': 'Providence',
            'training_dataset': 'all',
            'test_split': 'Providence-Child',
            'test_dataset': test_child,
            'model_type': 'BERT',
            'use_tags': True,
            'context_width': 20,
            'task_name': task_name,
            'n_samples': config.n_across_time,
            'task_phase': task_phase
        })

    score_store = []
    for model_arg in child_arg_list:

        model_arg['n_samples'] = config.n_across_time

        # Load results from the eval directory for this model.
        results_path = paths.get_directory(model_arg)

        search_string = os.path.join(results_path,
                                     hyperparameter_set + '_run_models_across_time_*.pkl')
        print('Searching ' + search_string)
        age_paths = glob.glob(search_string)

        single_model_store = []

        for this_data_path in age_paths:

            with open(this_data_path, "rb") as fh:
                data_df = pickle.load(fh)

            data_df['training_split'] = model_arg['training_split']
            data_df['training_dataset'] = model_arg['training_dataset']
            data_df['test_split'] = model_arg['test_split']
            data_df['test_dataset'] = model_arg['test_dataset']
            data_df['model_type'] = model_arg['model_type']
            data_df['split'] = data_df.training_split + '_' + data_df.training_dataset
            data_df['model'] = paths.get_file_identifier(model_arg)

            single_model_store.append(copy.copy(data_df))

        if len(single_model_store) > 0:
            score_store.append(pd.concat(single_model_store))

    return score_store