Example 1
def test_generate_specs_from_data_no_subset_specs():
    np.random.seed(10)
    data = {'Grammar': np.random.randn(10),
            'Fluency': np.random.randn(10),
            'Discourse': np.random.randn(10),
            'r1': np.random.choice(4, 10),
            'spkitemlab': ['a-5'] * 10}
    df = pd.DataFrame(data)
    specs = generate_specs_from_data(['Grammar', 'Fluency', 'Discourse'],
                                     'r1',
                                     df)
    feats = specs['features']
    assert_equal(len(feats), 3)
    assert_array_equal([f['feature'] for f in feats], ['Grammar', 'Fluency', 'Discourse'])
    assert_equal(feats[0]['sign'], 1)
    assert_equal(feats[1]['sign'], 1)
    assert_equal(feats[2]['sign'], 1)

    
Example 2
def test_generate_specs_from_data_with_transformation():
    feature_subset_specs = pd.DataFrame({'Feature': ['Grammar',
                                                     'Vocabulary',
                                                     'Fluency',
                                                     'Content_coverage',
                                                     'Discourse'],
                                        'Sign_SYS1': ['-', '+', '+', '+', '-']})
    np.random.seed(10)
    r1 = np.random.choice(range(1, 5), 10)
    data = {'Grammar': np.random.randn(10),
            'Vocabulary': r1**2,
            'Discourse': np.random.randn(10),
            'r1': r1,
            'spkitemlab': ['a-5'] * 10}
    df = pd.DataFrame(data)
    specs = generate_specs_from_data(['Grammar', 'Vocabulary', 'Discourse'],
                                     'r1',
                                     df,
                                     feature_subset_specs,
                                     'SYS1')
    feats = specs['features']
    assert_equal(feats[1]['feature'], 'Vocabulary')
    assert_equal(feats[1]['transform'], 'sqrt')
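
In this example 'Vocabulary' is constructed as the square of the score column r1, so a square-root transform makes it vary linearly with the score, which is presumably why the test expects 'sqrt'. The following standalone sketch only illustrates that intuition by comparing correlations of the raw and square-root-transformed feature with r1; it is not rsmtool's actual transform-selection code.

# Standalone illustration (not rsmtool's selection logic): the sqrt transform
# of a squared feature correlates perfectly with the score column.
import numpy as np

np.random.seed(10)
r1 = np.random.choice(range(1, 5), 10)
vocabulary = r1 ** 2

corr_raw = np.corrcoef(vocabulary, r1)[0, 1]
corr_sqrt = np.corrcoef(np.sqrt(vocabulary), r1)[0, 1]
print(corr_raw, corr_sqrt)  # corr_sqrt is ~1.0, so 'sqrt' is the better fit
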
Example 3
def test_generate_specs_from_data_with_negative_sign():
    feature_subset_specs = pd.DataFrame({'Feature': ['Grammar',
                                                     'Vocabulary',
                                                     'Fluency',
                                                     'Content_coverage',
                                                     'Discourse'],
                                        'Sign_SYS1': ['-', '+', '+', '+', '-']})
    np.random.seed(10)
    data = {'Grammar': np.random.randn(10),
            'Fluency': np.random.randn(10),
            'Discourse': np.random.randn(10),
            'r1': np.random.choice(4, 10),
            'spkitemlab': ['a-5'] * 10}
    df = pd.DataFrame(data)
    specs = generate_specs_from_data(['Grammar', 'Fluency', 'Discourse'],
                                     'r1',
                                     df,
                                     feature_subset_specs,
                                     'SYS1')
    feats = specs['features']
    assert_equal(len(feats), 3)
    assert_array_equal([f['feature'] for f in feats], ['Grammar', 'Fluency', 'Discourse'])
    assert_equal(feats[0]['sign'], -1)
    assert_equal(feats[1]['sign'], 1)
    assert_equal(feats[2]['sign'], -1)
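
Taken together, these tests suggest that generate_specs_from_data returns a dictionary with a 'features' list holding one entry per feature, where each entry carries at least a 'feature' name, a 'sign' of 1 or -1 (taken from the Sign_* column of the subset specs when provided, otherwise defaulting to 1), and a 'transform'. A sketch of that inferred shape follows; the 'raw' default transform is an assumption, not something these tests assert.

# Inferred shape of the returned specs (illustrative only; the 'raw' default
# transform is assumed, field names come from the assertions above).
specs = {
    'features': [
        {'feature': 'Grammar',   'sign': -1, 'transform': 'raw'},
        {'feature': 'Fluency',   'sign':  1, 'transform': 'raw'},
        {'feature': 'Discourse', 'sign': -1, 'transform': 'raw'},
    ]
}
assert [f['feature'] for f in specs['features']] == ['Grammar', 'Fluency', 'Discourse']
assert [f['sign'] for f in specs['features']] == [-1, 1, -1]
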
Example 4
def load_experiment_data(main_config_file, outdir):

    """
    Set up the experiment by loading the training
    and evaluation data sets and preprocessing them.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    logger.info('Reading configuration file: {}'.format(main_config_file))
    config_obj = read_json_file(main_config_file)
    config_obj = check_main_config(config_obj)

    # get the directory where the config file lives
    configpath = dirname(main_config_file)

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the description
    description = config_obj['description']

    # get the column name for the labels for the training and testing data
    train_label_column = config_obj['train_label_column']
    test_label_column = config_obj['test_label_column']

    # get the column name that will hold the ID for
    # both the training and the test data
    id_column = config_obj['id_column']

    # get the specified trim min and max values
    spec_trim_min, spec_trim_max = get_trim_min_max(config_obj)

    # get the name of the optional column that
    # contains response length.
    length_column = config_obj['length_column']

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj['second_human_score_column']

    # get the name of the optional column that
    # contains the candidate ID
    candidate_column = config_obj['candidate_column']

    # if the test label column is the same as the
    # second human score column, raise an error
    if test_label_column == second_human_score_column:
        raise ValueError("'test_label_column' and "
                         "'second_human_score_column' cannot have the "
                         "same value.")

    # get the name of the model that we want to train and
    # check that it's valid
    model_name = config_obj['model']
    model_type = check_model_name(model_name)

    # are we excluding zero scores?
    exclude_zero_scores = config_obj['exclude_zero_scores']

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    if exclude_zero_scores and spec_trim_min == 0:
        logger.warning("'exclude_zero_scores' is set to True but "
                       "'trim_min' is set to 0. This may cause "
                       "unexpected behavior.")

    # are we filtering on any other columns?
    flag_column_dict = check_flag_column(config_obj)

    # are we generating fake labels?
    use_fake_train_labels = train_label_column == 'fake'
    use_fake_test_labels = test_label_column == 'fake'

    # are we analyzing scaled or raw prediction values
    use_scaled_predictions = config_obj['use_scaled_predictions']

    # get the subgroups if any
    subgroups = config_obj.get('subgroups')

    # are there specific general report sections we want to include?
    general_report_sections = config_obj['general_sections']

    # what about the special or custom sections?
    special_report_sections = config_obj['special_sections']

    custom_report_section_paths = config_obj['custom_sections']

    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = locate_custom_sections(custom_report_section_paths,
                                                         configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    chosen_notebook_files = get_ordered_notebook_files(general_report_sections,
                                                       special_report_sections,
                                                       custom_report_sections,
                                                       section_order,
                                                       subgroups,
                                                       model_type=model_type,
                                                       context='rsmtool')

    # Read in the feature configurations.
    # Location of feature file
    feature_field = config_obj['features']

    # Check whether the feature subset file exists and whether we are
    # using a feature subset or a feature prefix
    feature_subset_file = config_obj['feature_subset_file']
    if feature_subset_file:
        feature_subset_file_location = locate_file(feature_subset_file, configpath)
        if not feature_subset_file_location:
            raise FileNotFoundError('Feature subset file {} not '
                                    'found.\n'.format(config_obj['feature_subset_file']))

    feature_subset = config_obj['feature_subset']
    feature_prefix = config_obj['feature_prefix']

    # if the user specified both a feature subset file and a feature subset,
    # read the file and check its format
    if feature_subset_file and feature_subset:
        feature_subset_specs = pd.read_csv(feature_subset_file_location)
        check_feature_subset_file(feature_subset_specs, feature_subset)
    else:
        feature_subset_specs = None


    # Do we need to automatically find the best transformations/change sign?
    select_transformations = config_obj['select_transformations']
    feature_sign = config_obj['sign']

    requested_features = []
    feature_specs = {}
    select_features_automatically = True

    # For backward compatibility, if this field is set to 'all',
    # we set select_transformations to True, as was done in
    # previous versions.
    if feature_field == 'all':
        select_transformations = True
    elif feature_field is not None:
        feature_file_location = locate_file(feature_field, configpath)
        select_features_automatically = False
        if not feature_file_location:
            raise FileNotFoundError('Feature file {} not '
                                    'found.\n'.format(config_obj['features']))
        else:
            logger.info('Reading feature file: {}'.format(feature_file_location))
            feature_json = read_json_file(feature_file_location)
            feature_specs = normalize_and_validate_feature_file(feature_json)
            requested_features = [fdict['feature'] for fdict in feature_specs['features']]

    # check that neither `length_column` nor `second_human_score_column`
    # is also included in the requested features, if they are specified
    if (length_column and
        length_column in requested_features):
        raise ValueError("The value of 'length_column' ('{}') cannot be "
                         "used as a model feature.".format(length_column))

    if (second_human_score_column and
        second_human_score_column in requested_features):
        raise ValueError("The value of 'second_human_score_column' ('{}') cannot be "
                         "used as a model feature.".format(second_human_score_column))

    # Specify column names that cannot be used as features
    reserved_column_names = list(set(['spkitemid', 'spkitemlab',
                                      'itemType', 'r1', 'r2', 'score',
                                      'sc', 'sc1', 'adj',
                                      train_label_column,
                                      test_label_column,
                                      id_column] + subgroups + list(flag_column_dict.keys())))

    # if `second_human_score_column` is specified, then we need to
    # add `sc2` to the list of reserved column names. The same goes
    # for 'length' and 'candidate' if `length_column` and
    # `candidate_column` are specified.
    if second_human_score_column:
        reserved_column_names.append('sc2')
    if length_column:
        reserved_column_names.append('length')
    if candidate_column:
        reserved_column_names.append('candidate')

    # Make sure that the training data as specified in the
    # config file actually exists on disk and if it does,
    # load it and filter out the bad rows and features with
    # zero standard deviation. Also double check that the requested
    # features exist in the data or obtain the feature names if
    # no feature file was given.
    train_file_location = locate_file(config_obj['train_file'], configpath)
    if not train_file_location:
        raise FileNotFoundError('Error: Training file {} '
                                'not found.\n'.format(config_obj['train_file']))
    else:
        logger.info('Reading training data: {}'.format(train_file_location))

    (df_train_features,
     df_train_metadata,
     df_train_other_columns,
     df_train_excluded,
     df_train_length,
     _,
     df_train_flagged_responses,
     used_trim_min,
     used_trim_max,
     feature_names) = load_and_filter_data(train_file_location,
                                           train_label_column,
                                           id_column,
                                           length_column,
                                           None,
                                           candidate_column,
                                           requested_features,
                                           reserved_column_names,
                                           spec_trim_min,
                                           spec_trim_max,
                                           flag_column_dict,
                                           subgroups,
                                           exclude_zero_scores=exclude_zero_scores,
                                           exclude_zero_sd=True,
                                           feature_subset_specs=feature_subset_specs,
                                           feature_subset=feature_subset,
                                           feature_prefix=feature_prefix,
                                           use_fake_labels=use_fake_train_labels)

    # Generate feature specifications now that we
    # know what features are selected
    if select_features_automatically:
        if select_transformations is False:
            feature_specs = generate_default_specs(feature_names)
        else:
            feature_specs = generate_specs_from_data(feature_names,
                                                     'sc1',
                                                     df_train_features,
                                                     feature_subset_specs=feature_subset_specs,
                                                     feature_sign=feature_sign)
    # Sanity check to make sure the function returned the
    # same feature names as specified in the feature JSON file,
    # if there was one
    else:
        assert feature_names == requested_features

    # Do the same for the test data, except that we can ignore the
    # specified trim min and max since we already have the values used
    # for the training data, and we already have the feature names even
    # when no feature file was specified. We also allow features with
    # zero standard deviation in the test file.
    test_file_location = locate_file(config_obj['test_file'], configpath)
    if not test_file_location:
        raise FileNotFoundError('Error: Evaluation file '
                                '{} not found.\n'.format(config_obj['test_file']))
    elif (test_file_location == train_file_location
            and train_label_column == test_label_column):
        logger.warning('The same data file and label '
                       'column are used for both training '
                       'and evaluating the model. No second '
                       'score analysis will be performed, even '
                       'if requested.')
        df_test_features = df_train_features.copy()
        df_test_metadata = df_train_metadata.copy()
        df_test_excluded = df_train_excluded.copy()
        df_test_other_columns = df_train_other_columns.copy()
        df_test_flagged_responses = df_train_flagged_responses.copy()
        df_test_human_scores = pd.DataFrame()
    else:
        logger.info('Reading evaluation data: {}'.format(test_file_location))
        (df_test_features,
         df_test_metadata,
         df_test_other_columns,
         df_test_excluded,
         _,
         df_test_human_scores,
         df_test_flagged_responses,
         _, _, _) = load_and_filter_data(test_file_location,
                                         test_label_column,
                                         id_column,
                                         None,
                                         second_human_score_column,
                                         candidate_column,
                                         feature_names,
                                         reserved_column_names,
                                         used_trim_min,
                                         used_trim_max,
                                         flag_column_dict,
                                         subgroups,
                                         exclude_zero_scores=exclude_zero_scores,
                                         exclude_zero_sd=False,
                                         use_fake_labels=use_fake_test_labels)

    return (df_train_features, df_test_features,
            df_train_metadata, df_test_metadata,
            df_train_other_columns, df_test_other_columns,
            df_train_excluded, df_test_excluded,
            df_train_length, df_test_human_scores,
            df_train_flagged_responses,
            df_test_flagged_responses,
            experiment_id, description,
            train_file_location, test_file_location,
            feature_specs, model_name, model_type,
            train_label_column, test_label_column,
            id_column, length_column, second_human_score_column,
            candidate_column,
            subgroups,
            feature_subset_file,
            used_trim_min, used_trim_max,
            use_scaled_predictions, exclude_zero_scores,
            select_features_automatically,
            chosen_notebook_files)
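
For context, here is a minimal sketch of how this loader might be called from a driver script, assuming load_experiment_data is in scope; the paths are hypothetical and only the first few of the many returned frames are unpacked explicitly.

# Hypothetical driver sketch (paths are made up; assumes the function above
# is defined in, or importable into, the current module).
config_file = '/path/to/experiment.json'
output_dir = '/path/to/output'

(df_train_features, df_test_features,
 df_train_metadata, df_test_metadata,
 *rest) = load_experiment_data(config_file, output_dir)

print('Training features:', df_train_features.shape)
print('Evaluation features:', df_test_features.shape)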