Example #1
def gen_training_commands(spec_dict):

    paths.validate_spec_dict(spec_dict, config.spec_dict_params)
    paths.validate_phase(spec_dict['task_phase'], config.task_phases)

    mem_alloc_gb, time_alloc_hrs, n_tasks, cpus_per_task = scripts.get_training_alloc(
        spec_dict['training_dataset'])

    header_commands = scripts.gen_command_header(
        mem_alloc_gb=mem_alloc_gb,
        time_alloc_hrs=time_alloc_hrs,
        n_tasks=n_tasks,
        cpus_per_task=cpus_per_task,
        two_gpus=(spec_dict['training_dataset']
                  in {'Providence', 'Providence-Age'}))
    slurm_commands = header_commands

    # get the directory where this should be saved
    model_output_spec_dict = copy.copy(spec_dict)
    model_output_spec_dict['task_phase'] = 'train'

    model_output_dir = paths.get_directory(model_output_spec_dict)

    if not os.path.exists(model_output_dir):
        os.makedirs(model_output_dir)

    # Clear out any previous contents; -f keeps this from failing if the directory doesn't exist yet.
    slurm_commands += [f"rm -rf {model_output_dir}\n"]
    # Recreate the training directory.
    slurm_commands += [f"mkdir -p {model_output_dir}\n"]
    slurm_commands += ["mkdir ~/.cache/$SLURM_JOB_ID\n"]

    data_input_spec_dict = copy.copy(spec_dict)
    data_input_spec_dict['task_phase'] = 'extract_data'

    data_input_dir = paths.get_directory(data_input_spec_dict)

    sh_loc = f"output/SLURM/{spec_dict['task_name']}_{spec_dict['task_phase']}"

    if not os.path.exists(sh_loc):
        os.makedirs(sh_loc)

    # Construct the python/training-related commands
    slurm_commands.append(
        scripts.get_run_mlm_command(spec_dict['training_split'],
                                    spec_dict['training_dataset'],
                                    spec_dict['use_tags'], data_input_dir,
                                    model_output_dir, config.slurm_user))

    slurm_filename = os.path.join(sh_loc,
                                  paths.get_slurm_script_name(spec_dict))
    print(slurm_filename)

    return slurm_filename, slurm_commands
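# A hypothetical usage sketch for gen_training_commands: the returned
# (filename, commands) pair is meant to be written out as a submittable SLURM
# script. The spec dict below only mirrors the keys the function reads; every
# value is a placeholder, and the valid options live in config/paths.
example_training_spec = {
    'task_name': 'non_child',
    'task_phase': 'train',
    'training_split': 'Providence',
    'training_dataset': 'all',
    'test_split': None,
    'test_dataset': None,
    'model_type': 'BERT',
    'use_tags': True,
    'context_width': None,
    'n_samples': None,
}
train_filename, train_commands = gen_training_commands(example_training_spec)
with open(train_filename, 'w') as f:
    for command in train_commands:
        # Some commands already end in a newline; normalize so each lands on its own line.
        f.write(command if command.endswith('\n') else command + '\n')
# The resulting script could then be submitted with `sbatch <train_filename>`.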
def gen_fitting_commands(fitting_spec_dict):

    paths.validate_spec_dict(fitting_spec_dict, config.spec_dict_params)
    paths.validate_phase(fitting_spec_dict['task_phase'], config.task_phases)

    mem_alloc_gb, time_alloc_hrs, n_tasks, cpus_per_task = scripts.get_training_alloc(
        fitting_spec_dict['training_dataset'])

    header_commands = scripts.gen_command_header(mem_alloc_gb=mem_alloc_gb,
                                                 time_alloc_hrs=time_alloc_hrs,
                                                 n_tasks=n_tasks,
                                                 cpus_per_task=cpus_per_task,
                                                 two_gpus=False)
    slurm_commands = header_commands

    fitting_output_path = paths.get_directory(fitting_spec_dict)
    if not exists(fitting_output_path):
        os.makedirs(fitting_output_path)

    # Clear out any previous contents, then recreate the fitting directory.
    slurm_commands += [f"rm -rf {fitting_output_path}\n"]
    slurm_commands += [f"mkdir -p {fitting_output_path}\n"]
    slurm_commands += ["mkdir ~/.cache/$SLURM_JOB_ID\n"]

    model_input_spec_dict = copy.copy(fitting_spec_dict)
    model_input_spec_dict['task_phase'] = 'train'
    model_input_spec_dict['context_width'] = None
    model_input_spec_dict['test_split'] = None
    model_input_spec_dict['test_dataset'] = None
    model_input_dir = paths.get_directory(model_input_spec_dict)

    sh_loc = f"output/SLURM/{fitting_spec_dict['task_name']}_{fitting_spec_dict['task_phase']}"

    if not exists(sh_loc):
        os.makedirs(sh_loc)

    sing_header = scripts.gen_singularity_header()
    # slurm_commands += [f"{sing_header} {gen_eval_scripts.get_one_python_command('src/run/run_beta_search.py', fitting_spec_dict['test_split'], fitting_spec_dict['test_dataset'] , fitting_spec_dict['use_tags'], fitting_spec_dict['context_width'], fitting_spec_dict['model_type'], fitting_spec_dict['training_dataset'], fitting_spec_dict['training_split'])[1]}\n"]

    slurm_commands += [
        f"{sing_header} {scripts.get_python_run_command('src/run/run_beta_search.py', fitting_spec_dict)}\n"
    ]

    slurm_filename = os.path.join(
        sh_loc, paths.get_slurm_script_name(fitting_spec_dict))

    return slurm_filename, slurm_commands
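# A hypothetical usage sketch for gen_fitting_commands: same pattern as
# training, but the fitting spec also carries the test split/dataset and
# context width that paths.get_directory uses. All values are placeholders.
example_fitting_spec = {
    'task_name': 'non_child',
    'task_phase': 'fit',
    'training_split': 'Providence',
    'training_dataset': 'all',
    'test_split': 'Providence',
    'test_dataset': 'all',
    'model_type': 'BERT',
    'use_tags': True,
    'context_width': 20,
    'n_samples': config.n_beta,
}
fit_filename, fit_commands = gen_fitting_commands(example_fitting_spec)
with open(fit_filename, 'w') as f:
    for command in fit_commands:
        f.write(command if command.endswith('\n') else command + '\n')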
def get_optimal_hyperparameter_value(this_model_dict, hyperparameter):
    '''
        Get the best hyperparameter value for a given model x test dataset
    '''

    fitted_model_dict = copy.copy(this_model_dict)
    fitted_model_dict['task_phase'] = 'fit'

    fitted_model_path = paths.get_directory(fitted_model_dict)

    if hyperparameter == 'beta':
        n_hyperparameter = config.n_beta
    elif hyperparameter in ['lambda', 'gamma']:
        n_hyperparameter = config.n_lambda
    else:
        raise ValueError(f'Unrecognized hyperparameter: {hyperparameter}')

    this_hyperparameter_results = pd.read_csv(
        join(fitted_model_path,
             hyperparameter + f'_search_results_{n_hyperparameter}.csv'))

    # Take the hyperparameter value that minimizes posterior surprisal
    list_hyperparameter_results = list(
        this_hyperparameter_results[hyperparameter + '_value'])
    list_surp = list(this_hyperparameter_results['posterior_surprisal'])

    argmin_hyperparameter = np.argmin(list_surp)
    best_hyperparameter = list_hyperparameter_results[argmin_hyperparameter]

    return best_hyperparameter
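# A hypothetical usage sketch for get_optimal_hyperparameter_value: look up the
# best beta and lambda for one model spec. The key set mirrors the spec dicts
# used elsewhere in this file; the values are placeholders.
example_eval_spec = {
    'task_name': 'analysis',
    'task_phase': 'eval',
    'training_split': 'Providence',
    'training_dataset': 'all',
    'test_split': 'Providence',
    'test_dataset': 'all',
    'model_type': 'BERT',
    'use_tags': True,
    'context_width': 20,
    'n_samples': config.n_across_time,
}
best_beta = get_optimal_hyperparameter_value(example_eval_spec, 'beta')
best_lambda = get_optimal_hyperparameter_value(example_eval_spec, 'lambda')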
Example #4
def optimize_beta_and_lambda(fitting_dict):
    '''
        Find the values of beta, lambda, and gamma that minimize posterior surprisal,
        and save the results where run_models_across_time can load them.

        Args:
        fitting_dict: a dictionary with keys for training_split, training_dataset, test_split, test_dataset, etc. See utils/paths.py for a full description

        Return: raw and aggregated results for the Levenshtein distance (beta), WFST (lambda), and child WFST (gamma) likelihoods; saves the scores for each hyperparameter value to CSV as a side effect
    '''

    beta_sample = hyperparameter_utils.get_hyperparameter_search_values('beta')
    lambda_sample = hyperparameter_utils.get_hyperparameter_search_values('lambda')
    # gamma is searched over the same grid of values as lambda
    gamma_sample = hyperparameter_utils.get_hyperparameter_search_values('lambda')
        
    # initial_vocab determines the softmax mask used by BERT; leave it as the mask for all evaluations/training
    initial_vocab, cmu_in_initial_vocab, cmu_indices_for_initial_vocab = load_models.get_initial_vocab_info()

    fitting_path = paths.get_directory(fitting_dict)
    
    if not exists(fitting_path):
        os.makedirs(fitting_path)
    
    success_utts_sample_path = paths.get_sample_csv_path(
        task_phase_to_sample_for='fit',
        split=fitting_dict['test_split'],
        dataset=fitting_dict['test_dataset'],
        data_type='success',
        age=None,
        n=config.n_beta)
    success_utts_sample = pd.read_csv(success_utts_sample_path).utterance_id
        
    # Don't use failures for the hyperparameter search
    hyperparam_search_results = sample_across_models.sample_across_models(
        success_utts_sample, [], fitting_dict, beta_sample, lambda_sample,
        gamma_sample, child_name=fitting_dict['training_dataset'])
    
    this_raw_beta_results = hyperparam_search_results.loc[hyperparam_search_results.likelihood_type == 'levdist']
    this_raw_lambda_results = hyperparam_search_results.loc[hyperparam_search_results.likelihood_type == 'wfst']
    this_raw_gamma_results = hyperparam_search_results.loc[hyperparam_search_results.likelihood_type == 'wfst-child']    

    # Log the beta results
    this_beta_results_surp = this_raw_beta_results.groupby(
        ['beta_value']).posterior_probability.agg(
            lambda x: np.mean(-1 * np.log(x))).reset_index()
    this_beta_results_surp = this_beta_results_surp.rename(
        columns={'posterior_probability': 'posterior_surprisal'})
    beta_results_path = join(fitting_path, f'beta_search_results_{config.n_beta}.csv')
    this_beta_results_surp.to_csv(beta_results_path)
    print("Writing beta results to", beta_results_path)
    #plot_hyperparameter_optimization(fitting_path, 'beta', beta_sample, this_beta_results_surp['posterior_surprisal'], split_name, dataset_name)
    
    
    # Log the lambda results
    this_lambda_results_surp = this_raw_lambda_results.groupby(
        ['lambda_value']).posterior_probability.agg(
            lambda x: np.mean(-1 * np.log(x))).reset_index()
    this_lambda_results_surp = this_lambda_results_surp.rename(
        columns={'posterior_probability': 'posterior_surprisal'})
    lambda_results_path = join(fitting_path, f'lambda_search_results_{config.n_beta}.csv')
    this_lambda_results_surp.to_csv(lambda_results_path)
    print("Writing lambda results to", lambda_results_path)
    #plot_hyperparameter_optimization(fitting_path, 'lambda', lambda_sample, this_lambda_results_surp['posterior_surprisal'], split_name, dataset_name)
    
    # Log the gamma results if necessary
    this_gamma_results_surp = this_raw_gamma_results.groupby(
        ['gamma_value']).posterior_probability.agg(
            lambda x: np.mean(-1 * np.log(x))).reset_index()
    this_gamma_results_surp = this_gamma_results_surp.rename(
        columns={'posterior_probability': 'posterior_surprisal'})
    gamma_results_path = join(fitting_path, f'gamma_search_results_{config.n_beta}.csv')
    this_gamma_results_surp.to_csv(gamma_results_path)
    print("Writing gamma results to", gamma_results_path)

    return this_raw_beta_results, this_beta_results_surp, this_raw_lambda_results, this_lambda_results_surp, this_raw_gamma_results, this_gamma_results_surp    
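# A hypothetical usage sketch for optimize_beta_and_lambda: the six return
# values pair each likelihood's raw per-utterance scores with its aggregated
# surprisal curve. This assumes a fitting dict shaped like example_fitting_spec
# sketched earlier.
(raw_beta, beta_surp,
 raw_lambda, lambda_surp,
 raw_gamma, gamma_surp) = optimize_beta_and_lambda(example_fitting_spec)

# The optimum is the value with minimal mean posterior surprisal, matching what
# get_optimal_hyperparameter_value later reads back from disk.
best_beta_value = beta_surp.loc[beta_surp.posterior_surprisal.idxmin(), 'beta_value']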
def assemble_scores_no_order(hyperparameter_set):
    """
    Load all of the non_child models for a given hyperparameter
    """

    model_args = (load_models.gen_finetune_model_args() +
                  load_models.gen_shelf_model_args() +
                  load_models.gen_unigram_args())

    score_store = []

    for model_arg in model_args:

        model_arg['task_name'] = 'analysis'
        model_arg['task_phase'] = 'eval'
        model_arg['test_split'] = 'Providence'
        model_arg['test_dataset'] = 'all'
        model_arg['n_samples'] = config.n_across_time

        # Directory to load the across-time score files from
        results_path = paths.get_directory(model_arg)
        search_string = join(
            results_path, hyperparameter_set + '_run_models_across_time_*.pkl')
        print('Searching ' + search_string)
        age_paths = glob.glob(search_string)

        for this_data_path in age_paths:

            #data_df = pd.read_pickle(this_data_path)
            with open(this_data_path, "rb") as fh:
                data_df = pickle.load(fh)
                data_df['training_split'] = model_arg['training_split']
                data_df['training_dataset'] = model_arg['training_dataset']
                data_df['test_split'] = model_arg['test_split']
                data_df['test_dataset'] = model_arg['test_dataset']
                data_df['model_type'] = model_arg['model_type']

                data_df['split'] = data_df.training_split + '_' + data_df.training_dataset
                data_df['model'] = paths.get_file_identifier(model_arg)

            score_store.append(data_df)

    return score_store
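# A hypothetical usage sketch for assemble_scores_no_order: the function returns
# a list of per-age DataFrames, which are typically pooled for aggregation.
# 'levdist' is a placeholder hyperparameter-set label.
scores = assemble_scores_no_order('levdist')
if scores:
    all_scores = pd.concat(scores, ignore_index=True)
    print(all_scores.groupby('model').size())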
def get_model_from_split(model_dict):

    model_path = paths.get_directory(model_dict)

    word_info_all = get_cmu_dict_info()
    word_info = word_info_all.word

    try:
        model = BertForMaskedLM.from_pretrained(model_path)
    except BaseException as e:
        print('Model loading failed. Does a model actually exist at ' + model_path + '?')
        print(e)
        raise ValueError('Terminating!')

    model.eval()
    tokenizer = BertTokenizer.from_pretrained(model_path)
    softmax_mask, vocab = transformers_bert_completions.get_softmax_mask(
        tokenizer, word_info)

    return {
        'modelLM': model,
        'tokenizer': tokenizer,
        'softmax_mask': softmax_mask,
        'use_speaker_labels': model_dict['use_tags']
    }
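# A hypothetical usage sketch for get_model_from_split: the returned bundle
# holds the fine-tuned model, its tokenizer, and the softmax mask. The spec
# below points at a trained model directory (placeholder values), and the
# masked-token scoring is standard transformers/torch usage, not repo-specific.
import torch

example_trained_spec = {
    'task_name': 'non_child',
    'task_phase': 'train',
    'training_split': 'Providence',
    'training_dataset': 'all',
    'test_split': None,
    'test_dataset': None,
    'model_type': 'BERT',
    'use_tags': True,
    'context_width': None,
    'n_samples': None,
}
model_bundle = get_model_from_split(example_trained_spec)
tokenizer = model_bundle['tokenizer']
bert_model = model_bundle['modelLM']

inputs = tokenizer("she wants the [MASK]", return_tensors='pt')
with torch.no_grad():
    outputs = bert_model(**inputs)
# Recent transformers versions expose logits as an attribute of the output object.
logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]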
def get_run_mlm_command(training_split, training_dataset, use_tags, data_input_dir, model_output_dir, slurm_user):

    # Copy so that the shared config dict is not mutated below.
    this_args_dict = dict(config.child_args if training_split ==
                          'Providence-Child' else config.general_training_args)

    if training_split == 'Providence-Child':

        # Child models are fine-tuned from the best all-data Providence model.
        task_name = 'child'
        base_model_spec = {
            'task_name': task_name,
            'task_phase': 'train',
            'training_split': 'Providence',
            'training_dataset': 'all',
            'test_split': None,
            'test_dataset': None,
            'model_type': 'BERT',
            'use_tags': True,
            'context_width': None,
            'n_samples': config.n_across_time
        }

        base_model_path = paths.get_directory(base_model_spec)
        #models_get_split_folder('all', 'all', is_tags)

    else:
        # Non-child models start from the off-the-shelf checkpoint.
        task_name = 'non_child'
        base_model_path = 'bert-base-uncased'

    this_args_dict['model_name_or_path'] = base_model_path

    this_args_list = sorted(this_args_dict.keys())  # sorted for readability

    if task_name == 'child':
        validation_filename = 'val.txt'
    elif task_name == 'non_child':
        validation_filename = 'eval.txt'
    else:
        raise ValueError('task_name not recognized for MLM training')

    data_args = [
        f"--train_file {data_input_dir}/train.txt",
        f"--validation_file {data_input_dir}/{validation_filename}",
        "--cache_dir ~/.cache/$SLURM_JOB_ID",
        f"--output_dir {model_output_dir}",
    ]

    trainer_args = [
        f"--{key} {this_args_dict[key]}" for key in this_args_list
    ]

    if config.dev_mode:
        trainer_args += [
            "--max_train_samples 10",
            "--max_eval_samples 10",
        ]

    main_command = f"singularity exec --nv -B /om,/om2/user/{slurm_user} /om2/user/{slurm_user}/vagrant/ubuntu20.simg"
    this_python_command = f' python3 src/run/run_mlm.py {" ".join(data_args + trainer_args)}'
    
    return f"{main_command}{this_python_command}"
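# A hypothetical usage sketch for get_run_mlm_command: build the singularity +
# run_mlm.py command string for a non-child model. The directory paths and the
# SLURM user name are placeholders.
mlm_command = get_run_mlm_command(
    training_split='Providence',
    training_dataset='all',
    use_tags=True,
    data_input_dir='output/data/example_input',
    model_output_dir='output/models/example_output',
    slurm_user='your_username')
print(mlm_command)
# -> "singularity exec --nv -B /om,/om2/user/... python3 src/run/run_mlm.py --train_file ..."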
def assemble_child_scores_no_order(hyperparameter_set):
    """
    Load all of the child-specific models (plus the Switchboard and Providence
    all-data baselines) for a given hyperparameter, crossed with each child's
    Providence test data.
    """

    task_name = 'analysis'
    task_phase = 'eval'
    child_names = load_splits.get_child_names()

    # Cross each child's model with the Providence test data for every child
    child_arg_list = []
    for training_child in child_names:
        for test_child in child_names:
            child_arg_list.append({
                'training_split': 'Providence-Child',
                'training_dataset': training_child,
                'test_split': 'Providence-Child',
                'test_dataset': test_child,
                'model_type': 'BERT',
                'use_tags': True,
                'context_width': 20,
                'task_name': task_name,
                'n_samples': config.n_across_time,
                'task_phase': task_phase
            })

    # Treat Switchboard as if it were a child model and cross it with each child's Providence test data
    for test_child in child_names:
        child_arg_list.append({
            'training_split': 'Switchboard',
            'training_dataset': 'all',
            'test_split': 'Providence-Child',
            'test_dataset': test_child,
            'model_type': 'BERT',
            'use_tags': False,
            'context_width': 20,
            'task_name': task_name,
            'n_samples': config.n_across_time,
            'task_phase': task_phase
        })

    # Cross the all-data Providence model with each child's Providence test data
    for test_child in child_names:
        child_arg_list.append({
            'training_split': 'Providence',
            'training_dataset': 'all',
            'test_split': 'Providence-Child',
            'test_dataset': test_child,
            'model_type': 'BERT',
            'use_tags': True,
            'context_width': 20,
            'task_name': task_name,
            'n_samples': config.n_across_time,
            'task_phase': task_phase
        })

    score_store = []

    for model_arg in child_arg_list:

        model_arg['n_samples'] = config.n_across_time

        # Directory to load the across-time score files from
        results_path = paths.get_directory(model_arg)

        search_string = os.path.join(
            results_path, hyperparameter_set + '_run_models_across_time_*.pkl')
        print('Searching ' + search_string)
        age_paths = glob.glob(search_string)

        single_model_store = []
        for this_data_path in age_paths:

            #data_df = pd.read_pickle(this_data_path)
            with open(this_data_path, "rb") as fh:
                data_df = pickle.load(fh)
                data_df['training_split'] = model_arg['training_split']
                data_df['training_dataset'] = model_arg['training_dataset']
                data_df['test_split'] = model_arg['test_split']
                data_df['test_dataset'] = model_arg['test_dataset']
                data_df['model_type'] = model_arg['model_type']

                data_df['split'] = data_df.training_split + '_' + data_df.training_dataset
                data_df['model'] = paths.get_file_identifier(model_arg)

                single_model_store.append(copy.copy(data_df))

        if len(single_model_store) > 0:
            score_store.append(pd.concat(single_model_store))

    return score_store
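# A hypothetical usage sketch for assemble_child_scores_no_order: as above, the
# result is a list of DataFrames, one per model that had score files on disk.
# 'levdist' is a placeholder hyperparameter-set label.
child_scores = assemble_child_scores_no_order('levdist')
if child_scores:
    pooled = pd.concat(child_scores, ignore_index=True)
    # One row count per (training child, test child) pairing.
    print(pooled.groupby(['training_dataset', 'test_dataset']).size())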