Example no. 1
import load_splits  # assumed importable from the surrounding repo; adjust to the actual package layout

def get_child_names():
    """
    Get the names of all children in the Providence corpus.
    """
    all_phono = load_splits.load_phono()
    return sorted(set(all_phono.target_child_name))
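# Hypothetical usage sketch (not part of the original module): enumerate the
# Providence children, e.g. to drive a per-child evaluation loop.
child_names = get_child_names()
print(f'Found {len(child_names)} children: {child_names}')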
Example no. 2
def load_cross_data(child_name):
    """
    Load the evaluation-phase (config.eval_phase) phono data for a single Providence child.
    """
    all_phono = load_splits.load_phono()
    child_phono = all_phono[all_phono.target_child_name == child_name]
    this_phono = child_phono[child_phono.phase_child_sample == config.eval_phase]
    
    return this_phono
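# Hypothetical usage sketch combining the two helpers above: pull each child's
# evaluation-phase utterances.
for child_name in get_child_names():
    child_eval_phono = load_cross_data(child_name)
    print(f'{child_name}: {len(child_eval_phono)} eval-phase rows')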
Example no. 3
# Imports assumed from the surrounding repo; the module names are taken from
# the calls below and may need adjusting to the actual package layout.
import copy

import numpy as np
import pandas as pd

import config
import load_models
import load_splits
import transformers_bert_completions
import wfst

def sample_across_models(success_ids, yyy_ids, model, beta_values, lambda_values, examples_mode = False, all_tokens_phono=None):
    '''
    Efficiently compute posterior values corresponding to different parameterizations of the likelihood:
    retrieve the priors once for a given model, compute the edit distances or WFST path lengths once,
    and then iterate over a range of values for the scaling parameter.

    Args:
        success_ids: utterance ids for utterances identified as communicative successes
        yyy_ids: utterance ids for utterances identified as communicative failures
        model: a model dictionary from the load_models functions (not a HuggingFace model alone!)
        beta_values: a vector of scaling parameters to test for the Levenshtein distance
        lambda_values: a vector of scaling parameters to test for the WFST distance
        examples_mode: return extra information about the top 10 completions, appropriate for
            generating the example table in the paper; otherwise very memory intensive
        all_tokens_phono: for the examples table, moving the loading of phono data up a level
            in the call stack avoids repeated data loading

    Returns:
        A dataframe with all tokens scored for all models.
    '''
     
    if all_tokens_phono is None:
        all_tokens_phono = load_splits.load_phono()
    this_bert_token_ids = all_tokens_phono.loc[all_tokens_phono.partition.isin(('success','yyy'))].bert_token_id
    initial_vocab, cmu_2syl_inchildes, cmu_indices_for_initial_vocab = load_models.get_initial_vocab_info()

    print(f"Running model {model['title']}...")

    # get the priors
    if model['type'] == 'BERT':
        priors_for_age_interval = transformers_bert_completions.compare_successes_failures(
            all_tokens_phono, success_ids, 
            yyy_ids, **model['kwargs'])

    elif model['type'] == 'unigram':
        priors_for_age_interval = transformers_bert_completions.compare_successes_failures_unigram_model(
            all_tokens_phono, success_ids, 
            yyy_ids, **model['kwargs'])

    else:
        raise ValueError(f"Unrecognized model type: {model['type']}")
      
    score_store_single_model = []

    
    print('Computing WFST path lengths...')
    wfst_distances_for_age_interval_unreduced, ipa = wfst.get_wfst_distance_matrix(all_tokens_phono, priors_for_age_interval, initial_vocab,  cmu_2syl_inchildes, config.fst_path, config.fst_sym_path)    
    wfst_distances_for_age_interval_unreduced = -1 * np.log(wfst_distances_for_age_interval_unreduced + 10**-20) # convert probabilities to surprisals (negative log space); the small epsilon guards against log(0)

    #for each word, find the citation pronunciation that is most likely to generate the observed data 
    wfst_distances_for_age_interval = wfst.reduce_duplicates(wfst_distances_for_age_interval_unreduced, cmu_2syl_inchildes, initial_vocab, 'min', cmu_indices_for_initial_vocab) # min for smallest surprisal
    
    for idx, lambda_value in enumerate(lambda_values):
        
        print(f'Processing lambda value {idx + 1} of {len(lambda_values)}')

        # get the posteriors        
        if model['type'] == 'BERT':
            posteriors_for_age_interval = transformers_bert_completions.get_posteriors(priors_for_age_interval, 
                wfst_distances_for_age_interval, initial_vocab, None, lambda_value, examples_mode = examples_mode)

        elif model['type'] == 'unigram':
            posteriors_for_age_interval = transformers_bert_completions.get_posteriors(
                priors_for_age_interval, wfst_distances_for_age_interval, initial_vocab,
                this_bert_token_ids, lambda_value, examples_mode = examples_mode)
            # If possible, compare the bert_token_id in sample_across_models to the
            # bert_token_id in one of the other score sets from BERT.
            
        posteriors_for_age_interval['scores']['lambda_value'] = lambda_value
        posteriors_for_age_interval['scores']['model'] = model['title']
        posteriors_for_age_interval['scores']['likelihood_type'] = 'wfst'
        
        posteriors_for_age_interval['scores'] = posteriors_for_age_interval['scores'].astype({'lambda_value' : 'float16'})  # astype returns a copy; assign it back
        this_score = copy.deepcopy(posteriors_for_age_interval['scores'])
        
        score_store_single_model.append(this_score)    


    print('Computing edit distances...')
    edit_distances_for_age_interval_unreduced = transformers_bert_completions.get_edit_distance_matrix(all_tokens_phono, priors_for_age_interval, cmu_2syl_inchildes)

    #for each word, find the citation pronunciation that is most likely to generate the observed data. Look for the one with the *smallest* edit distance     
    edit_distances_for_age_interval = wfst.reduce_duplicates(edit_distances_for_age_interval_unreduced, cmu_2syl_inchildes, initial_vocab, 'min', cmu_indices_for_initial_vocab)

    
    for idx, beta_value in enumerate(beta_values):
        
        print(f'Processing beta value {idx + 1} of {len(beta_values)}')

        # get the posteriors        
        if model['type'] == 'BERT':
            posteriors_for_age_interval = transformers_bert_completions.get_posteriors(priors_for_age_interval, 
                edit_distances_for_age_interval, initial_vocab, None, beta_value, examples_mode = examples_mode)

        elif model['type'] == 'unigram':
            # unlike the BERT branch, the unigram branch passes explicit BERT token ids
            posteriors_for_age_interval = transformers_bert_completions.get_posteriors(
                priors_for_age_interval, edit_distances_for_age_interval,
                initial_vocab, this_bert_token_ids, beta_value, examples_mode = examples_mode)
            # If possible, compare the bert_token_id in sample_across_models to the
            # bert_token_id in one of the other score sets from BERT.
            
        posteriors_for_age_interval['scores']['beta_value'] = beta_value
        posteriors_for_age_interval['scores']['model'] = model['title']
        posteriors_for_age_interval['scores']['likelihood_type'] = 'levdist'
        
        posteriors_for_age_interval['scores'] = posteriors_for_age_interval['scores'].astype({'beta_value' : 'float16'})  # astype returns a copy; assign it back
        this_score = copy.deepcopy(posteriors_for_age_interval['scores'])
        
        score_store_single_model.append(this_score)    

    all_scores = pd.concat(score_store_single_model)
    
    return all_scores 
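# Hypothetical usage sketch for sample_across_models. The utterance ids are
# placeholders, model_dict is assumed to come from load_models.get_model_dict
# (as in Example no. 5), and the grids reuse the config fields referenced
# elsewhere in these examples.
beta_grid = np.linspace(config.beta_low, config.beta_high, config.beta_num_values)
lambda_grid = np.linspace(config.lambda_low, config.lambda_high, config.lambda_num_values)

all_scores = sample_across_models(
    success_ids=[101],  # placeholder utterance ids
    yyy_ids=[202],
    model=model_dict,
    beta_values=beta_grid,
    lambda_values=lambda_grid)
print(all_scores[['model', 'likelihood_type']].drop_duplicates())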
Example no. 4
        help="Whether to include speaker tags. This should only be set to True for the CHILDES models."
    )

    # 7/7/21: https://stackoverflow.com/questions/17118999/python-argparse-unrecognized-arguments
    # parse_known_args() returns (known, unknown) and tolerates unrecognized
    # arguments instead of raising, unlike parse_args().
    raw_args = parser.parse_known_args()[0]

    # parsers.check_args(raw_args)

    this_model_args = vars(raw_args)
    this_model_args['task_phase'] = 'eval'
    this_model_args['n_samples'] = config.n_across_time
    print(this_model_args)

    all_phono = load_splits.load_phono()

    # TODO: this branching logic still needs to be tested

    if (this_model_args['test_split']
            == 'Providence') and (this_model_args['test_dataset'] == 'all'):
        this_sample_dict = load_splits.load_sample_model_across_time_args()

    elif (this_model_args['test_split'] == 'Providence-Child'):
        # TODO: trace where this logic appeared in the original notebooks

        this_sample_dict = {}

        eval_samples = all_phono.loc[
            (all_phono.phase_child_sample == 'eval')
            & (all_phono.target_child_name == this_model_args['test_dataset'])]
Example no. 5
# Imports assumed from the surrounding repo; module names are taken from the
# calls below and may need adjusting to the actual package layout.
import config
import hyperparameter_utils
import load_models
import load_splits
import sample_across_models

def get_scores_across_models(test_idx, which_models, is_success):
    '''
    Get scores across a selection of models, appropriate for an example figure.
    Looks at the results of run_beta_search to choose the best hyperparameter settings.

    Args:
        test_idx: utterance index
        which_models: selection of model specifications to run
        is_success: whether test_idx is a communicative success (True) or a communicative failure (False)
    '''

    scores_across_models = []
    success_ids, yyy_ids = [], []

    if is_success:
        success_ids = [test_idx]
    else:
        yyy_ids = [test_idx]

    all_tokens_phono = load_splits.load_phono()

    for args_extract in which_models:

        model_dict = load_models.get_model_dict(*args_extract)

        # Edge-of-grid failures are disabled here, so the fail_on_* checks
        # below are effectively no-ops.
        config.fail_on_lambda_edge = False
        config.fail_on_beta_edge = False

        optimal_lambda_value = [
            hyperparameter_utils.get_optimal_hyperparameter_value(
                *args_extract, 'lambda')
        ]
        if config.fail_on_lambda_edge:
            if optimal_lambda_value[0] >= config.lambda_high:
                raise ValueError(
                    'Lambda value is too high; examine the range for WFST scaling.'
                )
            if optimal_lambda_value[0] <= config.lambda_low:
                raise ValueError(
                    'Lambda value is too low; examine the range for WFST Distance scaling.'
                )

        optimal_beta_value = [
            hyperparameter_utils.get_optimal_hyperparameter_value(
                *args_extract, 'beta')
        ]
        if config.fail_on_beta_edge:
            if optimal_beta_value[0] >= config.beta_high:
                raise ValueError(
                    'Beta value is too high; examine the range for Levenshtein Distance scaling.'
                )
            if optimal_beta_value[0] <= config.beta_low:
                raise ValueError(
                    'Beta value is too low; examine the range for Levenshtein Distance scaling.'
                )

        this_scoring = sample_across_models.sample_across_models(
            success_ids,
            yyy_ids,
            model_dict,
            optimal_beta_value,
            optimal_lambda_value,
            examples_mode=True,
            all_tokens_phono=all_tokens_phono)

        scores_across_models.append(this_scoring)

    return scores_across_models
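# Hypothetical usage sketch for get_scores_across_models: score a single
# communicative failure under a list of model specifications. The spec tuple
# contents are placeholders and must match load_models.get_model_dict.
model_specs = [('placeholder_arg_1', 'placeholder_arg_2')]
failure_scores = get_scores_across_models(test_idx=12345, which_models=model_specs, is_success=False)
for scores in failure_scores:
    print(scores[['model', 'likelihood_type']].drop_duplicates())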