Code Example #1
def get_finetune_dict(fitted_dict):
    """
    Fine-tuned BERT models
    """

    # Copy the spec and point it at the training phase; the test-time fields are
    # cleared so that get_model_from_split can locate the fitted model.
    model_dict = copy.copy(fitted_dict)
    model_dict['task_phase'] = 'train'
    model_dict['test_dataset'] = None
    model_dict['test_split'] = None
    model_dict['context_width'] = None

    fitted_dict['title'] = paths.get_file_identifier(fitted_dict)
    fitted_dict['kwargs'] = get_model_from_split(model_dict)
    fitted_dict['kwargs']['context_width_in_utts'] = fitted_dict['context_width']
    fitted_dict['kwargs']['use_speaker_labels'] = fitted_dict['use_tags']
    return fitted_dict
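
A minimal usage sketch (hypothetical values): the spec below shows only illustrative identifying fields plus the two keys that get_finetune_dict actually reads, 'context_width' and 'use_tags'.

# Hypothetical spec; the values are illustrative, not taken from the repository.
example_spec = {
    'training_split': 'Providence',
    'training_dataset': 'all',
    'model_type': 'BERT',
    'use_tags': True,        # becomes kwargs['use_speaker_labels']
    'context_width': 20,     # becomes kwargs['context_width_in_utts']
    'task_phase': 'eval',
}
finetune_spec = get_finetune_dict(example_spec)
# finetune_spec['title'] identifies the model; finetune_spec['kwargs'] holds the loaded model pieces.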
Code Example #2
def get_data_unigram_dict(fitted_dict):
    """
    Unigram models estimated from child word counts in the training data
    """

    adult_tokenizer, adult_softmax_mask, _, initial_vocab = get_vocab_tok_modules()

    fitted_dict['title'] = paths.get_file_identifier(fitted_dict)
    fitted_dict['kwargs'] = {'child_counts_path': f'{config.finetune_dir}/all/all/chi_vocab_train.csv',
                             'tokenizer': adult_tokenizer,
                             'softmax_mask': adult_softmax_mask,
                             'vocab': initial_vocab,

                             # Added these default args 7/9/21 for compatibility with rest of the code
                             'context_width_in_utts': 0,
                             'use_speaker_labels': False,
                             }
    return fitted_dict
Code Example #3
def get_shelf_dict(fitted_dict):
    """
    Adult BERT models, no finetuning
    """
    
    # Load the pretrained model and switch it to evaluation mode; it is used as-is, with no fine-tuning.
    adult_bertMaskedLM = BertForMaskedLM.from_pretrained('bert-base-uncased')
    adult_bertMaskedLM.eval()
    
    adult_tokenizer, adult_softmax_mask, _, _ = get_vocab_tok_modules()
    
    fitted_dict['title'] = paths.get_file_identifier(fitted_dict)
    fitted_dict['kwargs'] = {'modelLM': adult_bertMaskedLM,
                             'tokenizer': adult_tokenizer,
                             'softmax_mask': adult_softmax_mask,
                             'context_width_in_utts': fitted_dict['context_width'],
                             'use_speaker_labels': fitted_dict['use_tags']
                             }
    return fitted_dict
Code Example #4
def get_flat_unigram_dict(fitted_dict):
    """
    Flat (uniform) unigram models
    """

    adult_tokenizer, adult_softmax_mask, _, initial_vocab = get_vocab_tok_modules()

    fitted_dict['title'] = paths.get_file_identifier(fitted_dict)
    # Note that this assumes that a flat prior = no information at all.
    # That means it doesn't observe any train/val split.
    # It just assigns uniform probability to every single word,
    # regardless of whether that word appears in the train set or not.
    fitted_dict['kwargs'] = {'child_counts_path': None,
                             'tokenizer': adult_tokenizer,
                             'softmax_mask': adult_softmax_mask,
                             'vocab': initial_vocab,

                             # Added these default args 7/9/21 for compatibility with rest of the code
                             'context_width_in_utts': 0,
                             'use_speaker_labels': False,
                             }
    return fitted_dict
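
The only substantive difference from get_data_unigram_dict above is child_counts_path: the data unigram reads child token counts from chi_vocab_train.csv, while the flat unigram passes None and scores every vocabulary item uniformly. A small sketch of that contrast, assuming both helpers are in scope; the spec values are illustrative.

# Illustrative spec; only identifying fields are shown.
base_spec = {'training_split': 'Providence', 'training_dataset': 'all', 'model_type': 'unigram'}

data_unigram = get_data_unigram_dict(dict(base_spec))
flat_unigram = get_flat_unigram_dict(dict(base_spec))

assert data_unigram['kwargs']['child_counts_path'] is not None   # count-based prior
assert flat_unigram['kwargs']['child_counts_path'] is None       # uniform prior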
Code Example #5
def assemble_scores_no_order(hyperparameter_set):
    """
    Load all of the non_child models for a given hyperparameter
    """

    model_args = (load_models.gen_finetune_model_args()
                  + load_models.gen_shelf_model_args()
                  + load_models.gen_unigram_args())

    score_store = []

    for model_arg in model_args:

        model_arg['task_name'] = 'analysis'
        model_arg['task_phase'] = 'eval'
        model_arg['test_split'] = 'Providence'
        model_arg['test_dataset'] = 'all'
        model_arg['n_samples'] = config.n_across_time

        # directory where this model's eval-phase results were written
        results_path = paths.get_directory(model_arg)
        search_string = join(
            results_path, hyperparameter_set + '_run_models_across_time_*.pkl')
        print('Searching ' + search_string)
        age_paths = glob.glob(search_string)

        for this_data_path in age_paths:

            #data_df = pd.read_pickle(this_data_path)
            with open(this_data_path, "rb") as fh:
                data_df = pickle.load(fh)
                data_df['training_split'] = model_arg['training_split']
                data_df['training_dataset'] = model_arg['training_dataset']
                data_df['test_split'] = model_arg['test_split']
                data_df['test_dataset'] = model_arg['test_dataset']
                data_df['model_type'] = model_arg['model_type']

                data_df['split'] = data_df.training_split + '_' + data_df.training_dataset
                data_df['model'] = paths.get_file_identifier(model_arg)

            score_store.append(data_df)

    return score_store
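
assemble_scores_no_order returns a list of per-file DataFrames rather than a single table; a typical follow-up (a sketch only, using a hypothetical hyperparameter_set label) concatenates them before analysis.

# 'val' is a placeholder for whatever hyperparameter_set label was used when the
# *_run_models_across_time_*.pkl files were written.
scores = pd.concat(assemble_scores_no_order('val'), ignore_index=True)
print(scores.groupby(['model_type', 'split']).size())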
Code Example #6
def assemble_child_scores_no_order(hyperparameter_set):
    """
    Load all of the non_child models for a given hyperparameter
    """

    task_name = 'analysis'
    task_phase = 'eval'
    child_names = load_splits.get_child_names()

    # cross each child with the Providence testing data for each other child
    child_arg_list = []
    for training_child in child_names:
        for test_child in child_names:
            child_arg_list.append({
                'training_split': 'Providence-Child',
                'training_dataset': training_child,
                'test_split': 'Providence-Child',
                'test_dataset': test_child,
                'model_type': 'BERT',
                'use_tags': True,
                'context_width': 20,
                'task_name': task_name,
                'n_samples': config.n_across_time,
                'task_phase': task_phase
            })

    # Treat Switchboard as if it were a child, and cross it with the Providence testing data for each child
    for test_child in child_names:
        child_arg_list.append({
            'training_split': 'Switchboard',
            'training_dataset': 'all',
            'test_split': 'Providence-Child',
            'test_dataset': test_child,
            'model_type': 'BERT',
            'use_tags': False,
            'context_width': 20,
            'task_name': task_name,
            'n_samples': config.n_across_time,
            'task_phase': task_phase
        })

    # Cross the Providence 'all' model with the Providence testing data for each child
    for test_child in child_names:
        child_arg_list.append({
            'training_split': 'Providence',
            'training_dataset': 'all',
            'test_split': 'Providence-Child',
            'test_dataset': test_child,
            'model_type': 'BERT',
            'use_tags': True,
            'context_width': 20,
            'task_name': task_name,
            'n_samples': config.n_across_time,
            'task_phase': task_phase
        })

    score_store = []

    for model_arg in child_arg_list:

        model_arg['n_samples'] = config.n_across_time

        # directory where this model's eval-phase results were written
        results_path = paths.get_directory(model_arg)

        search_string = os.path.join(
            results_path, hyperparameter_set + '_run_models_across_time_*.pkl')
        print('Searching ' + search_string)
        age_paths = glob.glob(search_string)

        single_model_store = []
        for this_data_path in age_paths:

            #data_df = pd.read_pickle(this_data_path)
            with open(this_data_path, "rb") as fh:
                data_df = pickle.load(fh)
                data_df['training_split'] = model_arg['training_split']
                data_df['training_dataset'] = model_arg['training_dataset']
                data_df['test_split'] = model_arg['test_split']
                data_df['test_dataset'] = model_arg['test_dataset']
                data_df['model_type'] = model_arg['model_type']

                data_df['split'] = data_df.training_split + '_' + data_df.training_dataset
                data_df['model'] = paths.get_file_identifier(model_arg)

                single_model_store.append(copy.copy(data_df))

        if len(single_model_store) > 0:
            score_store.append(pd.concat(single_model_store))

    return score_store
Code Example #7
def get_scores_across_models(test_idx, model_dicts, is_success):
    '''
    Get scores across a selection of models appropriate for an example figure.
    Uses the results of run_beta_search to choose the best hyperparameter settings.

    test_idx: utterance index
    model_dicts: selection of model specifications to run
    is_success: whether test_idx is a communicative success (True) or a communicative failure (False)

    '''

    scores_across_models = []
    success_ids, yyy_ids = [], []

    if is_success:
        success_ids = [test_idx]
    else:
        yyy_ids = [test_idx]

    all_tokens_phono = load_splits.load_phono()

    for model_dict in model_dicts:

        # need to specify the test data so that it can load the appropriate model
        model_dict['task_name'] = 'analysis'
        model_dict['task_phase'] = 'eval'
        model_dict['test_split'] = 'Providence'
        model_dict['test_dataset'] = 'all'
        model_dict['n_samples'] = config.n_across_time
        model_dict['title'] = paths.get_file_identifier(model_dict)
        model_dict['examples_mode'] = True

        # Turn off the edge-of-range checks below when scoring individual examples.
        config.fail_on_lambda_edge = False
        config.fail_on_beta_edge = False

        optimal_lambda_value = [
            hyperparameter_utils.get_optimal_hyperparameter_value(
                model_dict, 'lambda')
        ]
        if config.fail_on_lambda_edge:
            if optimal_lambda_value[0] >= config.lambda_high:
                raise ValueError(
                    'Lambda value is too high; examine the range for WFST scaling.'
                )
            if optimal_lambda_value[0] <= config.lambda_low:
                raise ValueError(
                    'Lambda value is too low; examine the range for WFST Distance scaling.'
                )

        optimal_beta_value = [
            hyperparameter_utils.get_optimal_hyperparameter_value(
                model_dict, 'beta')
        ]
        if config.fail_on_beta_edge:
            if optimal_beta_value[0] >= config.beta_high:
                raise ValueError(
                    'Beta value is too high; examine the range for Levenshtein Distance scaling.'
                )
            if optimal_beta_value[0] <= config.beta_low:
                raise ValueError(
                    'Beta value is too low; examine the range for Levenshtein Distance scaling.'
                )

        this_model_dict = load_models.get_fitted_model_dict(model_dict)

        best_beta_scores = sample_models_across_time.successes_and_failures_across_time_per_model(
            0, success_ids, yyy_ids, this_model_dict, all_tokens_phono,
            optimal_beta_value[0], 'levdist')
        best_beta_scores['likelihood_type'] = 'levdist'
        best_beta_scores['model'] = model_dict['title']
        scores_across_models.append(best_beta_scores)

        best_lambda_scores = sample_models_across_time.successes_and_failures_across_time_per_model(
            0, success_ids, yyy_ids, this_model_dict, all_tokens_phono,
            optimal_lambda_value[0], 'wfst')
        best_lambda_scores['likelihood_type'] = 'wfst'
        best_lambda_scores['model'] = model_dict['title']
        scores_across_models.append(best_lambda_scores)

    return scores_across_models
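
A hedged usage sketch: the model dicts would normally come from the load_models generators shown in Code Example #5, and the utterance index and success flag identify the example the figure highlights; the values below are placeholders.

example_utterance_idx = 12345                               # placeholder utterance index
example_model_dicts = load_models.gen_shelf_model_args()    # any list of model specs works

scores = get_scores_across_models(example_utterance_idx, example_model_dicts, is_success=True)
all_scores = pd.concat(scores, ignore_index=True)           # one entry per (model, likelihood type)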