Python _load_featureset Examples, skll.experiments._load_featureset Python Examples

Example #1

0

Show file

def check_load_featureset(suffix, numeric_ids):
    """
    Check that merging separate feature files at load time gives the same
    result as loading the pre-merged ``all`` file.

    :param suffix: file suffix handed to both ``make_merging_data`` and
                   ``_load_featureset``.
    :param numeric_ids: forwarded unchanged to ``make_merging_data``.
    """
    n_files = 5

    # Write the test data that the loads below read back
    make_merging_data(n_files, suffix, numeric_ids)

    data_dir = os.path.join(_my_dir, 'train', 'test_merging')

    # Featureset names '0' .. '4' are merged by the loader
    separate_names = []
    for idx in range(n_files):
        separate_names.append('{}'.format(idx))
    merged = _load_featureset(data_dir, separate_names, suffix, quiet=True)

    # The single 'all' featureset is the reference
    premerged = _load_featureset(data_dir, ['all'], suffix, quiet=True)

    # ids, classes and the (densified) feature matrix must agree
    assert np.all(merged.ids == premerged.ids)
    assert np.all(merged.classes == premerged.classes)
    merged_dense = merged.features.todense()
    assert np.all(merged_dense == premerged.features.todense())

    # ... and so must both vectorizers' feature names and vocabulary
    merged_vec = merged.feat_vectorizer
    eq_(merged_vec.feature_names_,
        premerged.feat_vectorizer.feature_names_)
    eq_(merged_vec.vocabulary_,
        premerged.feat_vectorizer.vocabulary_)

Example #2

0

Show file

def test_input_checking1():
    """
    Try to merge two featuresets that differ in their number of examples.

    NOTE(review): nothing is asserted here; the expected outcome is
    presumably enforced outside this function -- confirm.
    """
    mismatched_sets = ['test_input_2examples_1', 'test_input_3examples_1']
    _load_featureset(join(_my_dir, 'train'),
                     mismatched_sets,
                     '.jsonlines',
                     quiet=True)

Example #3

0

Show file

File: test_input.py Project: ChristianGeng/skll

def test_input_checking1():
    """
    Load a 2-example and a 3-example featureset together so that the
    loader has to merge inputs of different sizes.

    NOTE(review): no assertion is visible in this function; how the
    outcome is checked cannot be seen from here -- confirm.
    """
    train_dir = join(_my_dir, 'train')
    names = ['test_input_2examples_1', 'test_input_3examples_1']
    extension = '.jsonlines'
    _load_featureset(train_dir, names, extension, quiet=True)

Example #4

0

Show file

def test_input_checking2():
    """
    Try to join a featureset with itself, so every instance carries the
    same features twice.

    NOTE(review): nothing is asserted here; the expected outcome is
    presumably enforced outside this function -- confirm.
    """
    duplicated_sets = ['test_input_3examples_1', 'test_input_3examples_1']
    _load_featureset(join(_my_dir, 'train'),
                     duplicated_sets,
                     '.jsonlines',
                     quiet=True)

Example #5

0

Show file

File: test_input.py Project: ChristianGeng/skll

def test_input_checking2():
    """
    Load the same featureset name twice in one call, i.e. join inputs
    whose features are identical for each instance.

    NOTE(review): no assertion is visible in this function; how the
    outcome is checked cannot be seen from here -- confirm.
    """
    train_dir = join(_my_dir, 'train')
    names = ['test_input_3examples_1', 'test_input_3examples_1']
    extension = '.jsonlines'
    _load_featureset(train_dir, names, extension, quiet=True)

Example #6

0

Show file

def test_input_checking1():
    '''
    Joining featuresets with different numbers of examples is expected to
    raise ValueError.

    NOTE(review): the check for that exception is not visible in this
    function body -- confirm where it is enforced.
    '''
    mismatched_sets = ['test_input_2examples_1', 'test_input_3examples_1']
    _load_featureset(os.path.join(_my_dir, 'train'),
                     mismatched_sets,
                     '.jsonlines',
                     quiet=True)

Example #7

0

Show file

def test_input_checking2():
    '''
    Joining featuresets that contain the same features for each instance
    is expected to raise ValueError.

    NOTE(review): the check for that exception is not visible in this
    function body -- confirm where it is enforced.
    '''
    duplicated_sets = ['test_input_3examples_1', 'test_input_3examples_1']
    _load_featureset(os.path.join(_my_dir, 'train'),
                     duplicated_sets,
                     '.jsonlines',
                     quiet=True)

Example #8

0

Show file

File: test_input.py Project: ChristianGeng/skll

def test_one_file_load_featureset():
    """
    A featureset loaded from a full file path must equal the one loaded
    from directory + featureset name + suffix.
    """
    train_dir = join(_my_dir, 'train')
    extension = '.jsonlines'
    names = ['test_input_2examples_1']

    # Full path to the file, with empty featureset and suffix arguments
    file_path = join(train_dir, 'test_input_2examples_1.jsonlines')
    from_path = _load_featureset(file_path, '', '', quiet=True)

    # The same file addressed through its parts
    from_parts = _load_featureset(train_dir, names, extension, quiet=True)

    eq_(from_path, from_parts)

Example #9

0

Show file

def test_one_file_load_featureset():
    """
    Compare two ways of loading one file with _load_featureset: by its
    complete path, and by directory, featureset list and suffix.
    """
    base_dir = join(_my_dir, 'train')

    # Way 1: the path names the file itself; the other two args are empty
    by_path = _load_featureset(
        join(base_dir, 'test_input_2examples_1.jsonlines'),
        '',
        '',
        quiet=True)

    # Way 2: directory, one-element featureset list and suffix
    by_parts = _load_featureset(base_dir,
                                ['test_input_2examples_1'],
                                '.jsonlines',
                                quiet=True)

    eq_(by_path, by_parts)

Example #10

0

Show file

def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert each unmerged feature file from ``from_suffix`` to
    ``to_suffix`` and check that loading the converted files together
    matches the pre-merged ``*_all`` file in the ``to_suffix`` format.

    :param from_suffix: suffix of the input files handed to the converter.
    :param to_suffix: suffix of the converted files and of the pre-merged
                      reference file.
    """
    n_files = 5

    # Write the test data
    make_conversion_data(n_files, from_suffix, to_suffix)

    # Directory holding the unmerged feature files
    conv_dir = os.path.join(_my_dir, 'train', 'test_conversion')

    # File names start with "<from>_to_<to>", without the leading dots
    source_tag = from_suffix.lstrip('.')
    target_tag = to_suffix.lstrip('.')
    prefix = '{}_to_{}'.format(source_tag, target_tag)

    # Convert every unmerged file from the source to the target format
    for idx in range(n_files):
        source_name = '{}_{}{}'.format(prefix, idx, from_suffix)
        target_name = '{}_{}{}'.format(prefix, idx, to_suffix)
        skll_convert.main(['--quiet',
                           os.path.join(conv_dir, source_name),
                           os.path.join(conv_dir, target_name)])

    # Load the converted files as one merged featureset
    converted_names = []
    for idx in range(n_files):
        converted_names.append('{}_{}'.format(prefix, idx))
    merged = _load_featureset(conv_dir, converted_names, to_suffix,
                              quiet=True)

    # Load the pre-merged reference in the target format
    reference_names = ['{}_all'.format(prefix)]
    premerged = _load_featureset(conv_dir, reference_names, to_suffix,
                                 quiet=True)

    # Converted-then-merged data must equal the pre-generated merged data
    assert np.all(merged.ids == premerged.ids)
    assert np.all(merged.classes == premerged.classes)
    merged_dense = merged.features.todense()
    assert np.all(merged_dense == premerged.features.todense())

    merged_vec = merged.feat_vectorizer
    eq_(merged_vec.feature_names_,
        premerged.feat_vectorizer.feature_names_)
    eq_(merged_vec.vocabulary_,
        premerged.feat_vectorizer.vocabulary_)

Example #11

0

Show file

def check_specified_cv_folds(numeric_ids):
    """
    Run the two ``test_cv_folds`` experiments from their config templates,
    check the scores in the results files, then repeat the comparison by
    training a learner directly.

    :param numeric_ids: forwarded to ``make_cv_folds_data``; also decides
                        whether each config's ``ids_to_floats=`` line is
                        rewritten to ``true`` or ``false``.
    """
    make_cv_folds_data(numeric_ids)

    # Each entry: experiment name, predicate every score must satisfy,
    # and the number of grid scores expected in the results file.
    #   test_cv_folds1.cfg has prespecified folds and should have ~50%
    #   accuracy; test_cv_folds2.cfg doesn't have prespecified folds and
    #   should have >95% accuracy.
    experiments = [
        ('test_cv_folds1', lambda x: x < 0.6, 3),
        ('test_cv_folds2', lambda x: x > 0.95, 10),
    ]

    for experiment_name, test_func, grid_size in experiments:
        template_name = '{}.template.cfg'.format(experiment_name)
        template_path = os.path.join(_my_dir, 'configs', template_name)
        config_path = os.path.join(_my_dir,
                                   fill_in_config_paths(template_path))

        # Rewrite the config in place so that ids_to_floats follows
        # numeric_ids; all other lines are written back untouched.
        with open(config_path, 'r+') as config_file:
            original_lines = config_file.readlines()
            config_file.seek(0)
            config_file.truncate()
            for text in original_lines:
                if not text.startswith('ids_to_floats='):
                    config_file.write(text)
                    continue
                config_file.write('ids_to_floats=true\n' if numeric_ids
                                  else 'ids_to_floats=false\n')

        run_configuration(config_path, quiet=True)

        results_name = '{}_test_cv_folds_LogisticRegression.results'.format(
            experiment_name)
        with open(os.path.join(_my_dir, 'output', results_name)) as f:
            outstr = f.read()

        # check held out scores
        last_group = SCORE_OUTPUT_RE.search(outstr).groups()[-1]
        assert test_func(float(last_group))

        # every grid score must satisfy the same predicate
        grid_scores = GRID_RE.findall(outstr)
        assert len(grid_scores) == grid_size
        for score_text in grid_scores:
            assert test_func(float(score_text))

    # try the same tests for just training (and specifying the folds for
    # the grid search)
    train_dir = os.path.join(_my_dir, 'train')
    examples = _load_featureset(train_dir, ['test_cv_folds'], '.jsonlines',
                                quiet=True)
    clf = Learner('LogisticRegression', probability=True)
    folds_path = os.path.join(_my_dir, 'train', 'test_cv_folds.csv')
    cv_folds = _load_cv_folds(folds_path)

    # with the folds loaded from the CSV file
    score_with_folds = clf.train(examples,
                                 grid_search_folds=cv_folds,
                                 grid_objective='accuracy',
                                 grid_jobs=1)
    assert score_with_folds < 0.6

    # with a plain fold count of 5
    score_without_folds = clf.train(examples,
                                    grid_search_folds=5,
                                    grid_objective='accuracy',
                                    grid_jobs=1)
    assert score_without_folds > 0.95

Example #12

0

Show file

File: test_featureset.py Project: nimmen/skll

def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert each unmerged feature file from ``from_suffix`` to
    ``to_suffix`` and check that loading the converted files together
    matches the pre-merged ``*_all`` file in the ``to_suffix`` format.

    :param from_suffix: suffix of the input files handed to the converter.
    :param to_suffix: suffix of the converted files and of the pre-merged
                      reference file.
    """
    n_files = 5

    # Write the test data
    make_conversion_data(n_files, from_suffix, to_suffix)

    # Directory holding the unmerged feature files
    conv_dir = join(_my_dir, 'train', 'test_conversion')

    # File names start with "<from>_to_<to>", without the leading dots
    source_tag = from_suffix.lstrip('.')
    target_tag = to_suffix.lstrip('.')
    prefix = '{}_to_{}'.format(source_tag, target_tag)

    # Convert every unmerged file from the source to the target format
    for idx in range(n_files):
        source_name = '{}_{}{}'.format(prefix, idx, from_suffix)
        target_name = '{}_{}{}'.format(prefix, idx, to_suffix)
        skll_convert.main(['--quiet',
                           join(conv_dir, source_name),
                           join(conv_dir, target_name)])

    # Load the converted files as one merged featureset
    converted_names = []
    for idx in range(n_files):
        converted_names.append('{}_{}'.format(prefix, idx))
    merged = _load_featureset(conv_dir, converted_names, to_suffix,
                              quiet=True)

    # Load the pre-merged reference in the target format
    reference_names = ['{}_all'.format(prefix)]
    premerged = _load_featureset(conv_dir, reference_names, to_suffix,
                                 quiet=True)

    # Converted-then-merged data must equal the pre-generated merged data
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)

    # Walk both sets in step; the third item of each triple is compared
    for merged_row, premerged_row in zip(merged, premerged):
        _, _, merged_feats = merged_row
        _, _, premerged_feats = premerged_row
        eq_(merged_feats, premerged_feats)

    # Feature names are compared after sorting
    merged_names = sorted(merged.vectorizer.feature_names_)
    eq_(merged_names, sorted(premerged.vectorizer.feature_names_))

Example #13

0

Show file

File: test_input.py Project: ChristianGeng/skll

def test_input_checking3():
    """
    Merging the two 3-example featuresets must give a feature matrix
    with 3 rows.
    """
    to_merge = ['test_input_3examples_1', 'test_input_3examples_2']
    merged = _load_featureset(join(_my_dir, 'train'),
                              to_merge,
                              '.jsonlines',
                              quiet=True)
    row_count = merged.features.shape[0]
    eq_(row_count, 3)

Example #14

0

Show file

def test_input_checking3():
    '''
    Merge two 3-example featuresets and verify that the merged feature
    matrix has 3 rows.
    '''
    to_merge = ['test_input_3examples_1', 'test_input_3examples_2']
    merged = _load_featureset(os.path.join(_my_dir, 'train'),
                              to_merge,
                              '.jsonlines',
                              quiet=True)
    row_count = merged.features.shape[0]
    assert row_count == 3

Example #15

0

Show file

def test_input_checking3():
    """
    Check that two featuresets of 3 examples each merge into a feature
    matrix whose first dimension is 3.
    """
    train_dir = join(_my_dir, 'train')
    names = ['test_input_3examples_1', 'test_input_3examples_2']
    extension = '.jsonlines'

    loaded = _load_featureset(train_dir, names, extension, quiet=True)
    n_rows, = loaded.features.shape[:1]
    eq_(n_rows, 3)

Example #16

0

Show file

File: test_featureset.py Project: BK-University/skll

def check_convert_featureset(from_suffix, to_suffix):
    """
    Run the converter over the unmerged feature files, then verify that
    the converted files, merged at load time, equal the pre-merged file
    stored in the target format.

    :param from_suffix: suffix of the files given to the converter.
    :param to_suffix: suffix of the files the converter writes, and of
                      the pre-merged reference file.
    """
    file_count = 5

    # Create test data
    make_conversion_data(file_count, from_suffix, to_suffix)

    # Where the unmerged feature files live
    work_dir = join(_my_dir, 'train', 'test_conversion')

    # Common prefix of every file name, e.g. "<from>_to_<to>"
    name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                    to_suffix.lstrip('.'))

    # One converter call per unmerged file; the per-file featureset names
    # are collected along the way for the merged load below.
    per_file_names = []
    for number in range(file_count):
        in_path = join(work_dir,
                       '{}_{}{}'.format(name_prefix, number, from_suffix))
        out_path = join(work_dir,
                        '{}_{}{}'.format(name_prefix, number, to_suffix))
        skll_convert.main(['--quiet', in_path, out_path])
        per_file_names.append('{}_{}'.format(name_prefix, number))

    # Converted files, merged by the loader
    merged_exs = _load_featureset(work_dir,
                                  per_file_names,
                                  to_suffix,
                                  quiet=True)

    # Pre-merged file in the target format
    premerged_exs = _load_featureset(work_dir,
                                     ['{}_all'.format(name_prefix)],
                                     to_suffix,
                                     quiet=True)

    # Both loads must agree on ids, labels, per-example features and
    # (sorted) feature names.
    assert_array_equal(merged_exs.ids, premerged_exs.ids)
    assert_array_equal(merged_exs.labels, premerged_exs.labels)
    for left, right in zip(merged_exs, premerged_exs):
        (_, _, left_feats), (_, _, right_feats) = left, right
        eq_(left_feats, right_feats)
    eq_(sorted(merged_exs.vectorizer.feature_names_),
        sorted(premerged_exs.vectorizer.feature_names_))

Example #17

0

Show file

File: test_skll.py Project: wavelets/skll

def check_convert_featureset(from_suffix, to_suffix):
    """
    Run the converter over the unmerged feature files, then verify that
    the converted files, merged at load time, equal the pre-merged file
    stored in the target format.

    :param from_suffix: suffix of the files given to the converter.
    :param to_suffix: suffix of the files the converter writes, and of
                      the pre-merged reference file.
    """
    file_count = 5

    # Create test data
    make_conversion_data(file_count, from_suffix, to_suffix)

    # Where the unmerged feature files live
    work_dir = os.path.join(_my_dir, 'train', 'test_conversion')

    # Common prefix of every file name, e.g. "<from>_to_<to>"
    name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                    to_suffix.lstrip('.'))

    # One converter call per unmerged file; the per-file featureset names
    # are collected along the way for the merged load below.
    per_file_names = []
    for number in range(file_count):
        in_path = os.path.join(
            work_dir, '{}_{}{}'.format(name_prefix, number, from_suffix))
        out_path = os.path.join(
            work_dir, '{}_{}{}'.format(name_prefix, number, to_suffix))
        skll_convert.main(['--quiet', in_path, out_path])
        per_file_names.append('{}_{}'.format(name_prefix, number))

    # Converted files, merged by the loader
    merged_examples = _load_featureset(work_dir,
                                       per_file_names,
                                       to_suffix,
                                       quiet=True)

    # Pre-merged file in the target format
    premerged_examples = _load_featureset(work_dir,
                                          ['{}_all'.format(name_prefix)],
                                          to_suffix,
                                          quiet=True)

    # Both loads must agree on ids, classes and the dense feature matrix
    assert np.all(merged_examples.ids == premerged_examples.ids)
    assert np.all(merged_examples.classes == premerged_examples.classes)
    dense_merged = merged_examples.features.todense()
    assert np.all(dense_merged == premerged_examples.features.todense())

    # ... and on the vectorizers' feature names and vocabulary
    vec_merged = merged_examples.feat_vectorizer
    eq_(vec_merged.feature_names_,
        premerged_examples.feat_vectorizer.feature_names_)
    eq_(vec_merged.vocabulary_,
        premerged_examples.feat_vectorizer.vocabulary_)

Example #18

0

Show file

File: test_featureset.py Project: BK-University/skll

def check_load_featureset(suffix, numeric_ids):
    """
    Check that merging separate feature files at load time gives the same
    result as loading the pre-merged ``all`` file.

    :param suffix: file suffix handed to both ``make_merging_data`` and
                   ``_load_featureset``.
    :param numeric_ids: forwarded unchanged to ``make_merging_data``.
    """
    n_files = 5

    # Write the test data that the loads below read back
    make_merging_data(n_files, suffix, numeric_ids)

    data_dir = join(_my_dir, 'train', 'test_merging')

    # Featureset names '0' .. '4' are merged by the loader
    separate_names = []
    for idx in range(n_files):
        separate_names.append('{}'.format(idx))
    merged = _load_featureset(data_dir, separate_names, suffix, quiet=True)

    # The single 'all' featureset is the reference
    premerged = _load_featureset(data_dir, ['all'], suffix, quiet=True)

    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)

    # Walk both sets in step; the third item of each triple is compared
    for merged_row, premerged_row in zip(merged, premerged):
        _, _, merged_feats = merged_row
        _, _, premerged_feats = premerged_row
        eq_(merged_feats, premerged_feats)

    # Feature names are compared after sorting
    merged_names = sorted(merged.vectorizer.feature_names_)
    eq_(merged_names, sorted(premerged.vectorizer.feature_names_))

Example #19

0

Show file

def check_load_featureset(suffix, numeric_ids):
    """
    Load five per-feature files as one merged featureset and compare it
    against the pre-merged ``all`` featureset.

    :param suffix: suffix used when creating and when loading the files.
    :param numeric_ids: passed through to ``make_merging_data``.
    """
    file_count = 5

    # Create test data
    make_merging_data(file_count, suffix, numeric_ids)

    work_dir = join(_my_dir, 'train', 'test_merging')

    # Unmerged data, merged by the loader
    numbered = ['{}'.format(number) for number in range(file_count)]
    merged_exs = _load_featureset(work_dir,
                                  numbered,
                                  suffix,
                                  quiet=True)

    # Pre-merged data
    premerged_exs = _load_featureset(work_dir,
                                     ['all'],
                                     suffix,
                                     quiet=True)

    # ids, labels, per-example features and sorted feature names must
    # all match between the two loads.
    assert_array_equal(merged_exs.ids, premerged_exs.ids)
    assert_array_equal(merged_exs.labels, premerged_exs.labels)
    for left, right in zip(merged_exs, premerged_exs):
        (_, _, left_feats), (_, _, right_feats) = left, right
        eq_(left_feats, right_feats)
    eq_(sorted(merged_exs.vectorizer.feature_names_),
        sorted(premerged_exs.vectorizer.feature_names_))

Example #20

0

Show file

File: test_skll.py Project: wavelets/skll

def check_load_featureset(suffix, numeric_ids):
    """
    Load five per-feature files as one merged featureset and compare it
    against the pre-merged ``all`` featureset.

    :param suffix: suffix used when creating and when loading the files.
    :param numeric_ids: passed through to ``make_merging_data``.
    """
    file_count = 5

    # Create test data
    make_merging_data(file_count, suffix, numeric_ids)

    work_dir = os.path.join(_my_dir, 'train', 'test_merging')

    # Unmerged data, merged by the loader
    numbered = ['{}'.format(number) for number in range(file_count)]
    merged_examples = _load_featureset(work_dir,
                                       numbered,
                                       suffix,
                                       quiet=True)

    # Pre-merged data
    premerged_examples = _load_featureset(work_dir,
                                          ['all'],
                                          suffix,
                                          quiet=True)

    # ids, classes and the dense feature matrix must match
    assert np.all(merged_examples.ids == premerged_examples.ids)
    assert np.all(merged_examples.classes == premerged_examples.classes)
    dense_merged = merged_examples.features.todense()
    assert np.all(dense_merged == premerged_examples.features.todense())

    # so must the vectorizers' feature names and vocabulary
    vec_merged = merged_examples.feat_vectorizer
    eq_(vec_merged.feature_names_,
        premerged_examples.feat_vectorizer.feature_names_)
    eq_(vec_merged.vocabulary_,
        premerged_examples.feat_vectorizer.vocabulary_)

Example #21

0

Show file

File: test_cv.py Project: monkidea/skll

def test_cross_validate_task():
    """
    Test that 10-fold cross_validate experiments work.
    Test that fold ids get correctly saved.
    """

    def _flatten(mapping):
        # Key/value pairs, sorted by key, joined into one string so two
        # dictionaries can be compared quickly.
        return ''.join('{}{}'.format(key, val)
                       for key, val in sorted(mapping.items()))

    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))
    template_path = join(_my_dir, "configs",
                         "test_save_cv_folds.template.cfg")
    config_path = fill_in_config_paths_for_single_file(template_path,
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check final average results
    # (entry 10 of the results list is presumably the average that
    # follows the ten per-fold entries -- confirm)
    results_path = join(
        _my_dir, 'output',
        'test_save_cv_folds_train_f0.jsonlines_LogisticRegression'
        '.results.json')
    with open(results_path) as results_file:
        all_results = json.load(results_file)
    result_dict = all_results[10]

    assert_almost_equal(result_dict['score'], 0.517)

    # Rebuild the expected id -> fold mapping from a fresh 10-way split
    examples = _load_featureset(train_path, '', suffix, quiet=True)
    kfold = StratifiedKFold(n_splits=10)
    splits = kfold.split(examples.features, examples.labels)
    expected_skll_ids = {}
    for fold_num, (_, test_indices) in enumerate(splits):
        expected_skll_ids.update(
            (examples.ids[index], fold_num) for index in test_indices)

    # Read back the id -> fold mapping that the experiment saved
    fold_ids_path = join(_my_dir, 'output',
                         'test_save_cv_folds_skll_fold_ids.csv')
    with open(fold_ids_path) as fold_ids_file:
        skll_fold_ids = {row['id']: row['cv_test_fold']
                         for row in csv.DictReader(fold_ids_file)}

    assert_equal(_flatten(skll_fold_ids), _flatten(expected_skll_ids))

Example #22

0

Show file

File: test_cv.py Project: ChristianGeng/skll

def test_cross_validate_task():
    """
    Test that 10-fold cross_validate experiments work.
    Test that fold ids get correctly saved.
    """

    def _flatten(mapping):
        # Key/value pairs, sorted by key, joined into one string so two
        # dictionaries can be compared quickly.
        return ''.join('{}{}'.format(key, val)
                       for key, val in sorted(mapping.items()))

    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))
    template_path = join(_my_dir, "configs",
                         "test_save_cv_folds.template.cfg")
    config_path = fill_in_config_paths_for_single_file(template_path,
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check final average results
    # (entry 10 of the results list is presumably the average that
    # follows the ten per-fold entries -- confirm)
    results_path = join(
        _my_dir, 'output',
        'test_save_cv_folds_train_f0.jsonlines_LogisticRegression'
        '.results.json')
    with open(results_path) as results_file:
        all_results = json.load(results_file)
    result_dict = all_results[10]

    assert_almost_equal(result_dict['score'], 0.517)

    # Rebuild the expected id -> fold mapping from a fresh 10-way split.
    # NOTE(review): labels go to the constructor and the object is
    # iterated directly; this looks like the older StratifiedKFold call
    # form -- confirm it matches the scikit-learn version in use.
    examples = _load_featureset(train_path, '', suffix, quiet=True)
    kfold = StratifiedKFold(examples.labels, n_folds=10)
    expected_skll_ids = {}
    for fold_num, (_, test_indices) in enumerate(kfold):
        expected_skll_ids.update(
            (examples.ids[index], fold_num) for index in test_indices)

    # Read back the id -> fold mapping that the experiment saved
    fold_ids_path = join(_my_dir, 'output',
                         'test_save_cv_folds_skll_fold_ids.csv')
    with open(fold_ids_path) as fold_ids_file:
        skll_fold_ids = {row['id']: row['cv_test_fold']
                         for row in csv.DictReader(fold_ids_file)}

    assert_equal(_flatten(skll_fold_ids), _flatten(expected_skll_ids))

Example #23

0

Show file

File: test_featureset.py Project: EducationalTestingService/skll

def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Convert each unmerged feature file from ``from_suffix`` to
    ``to_suffix`` and check that loading the converted files together
    matches the pre-merged ``*_all`` file in the ``to_suffix`` format.

    :param from_suffix: suffix of the input files handed to the converter.
    :param to_suffix: suffix of the converted files and of the pre-merged
                      reference file.
    :param with_labels: when false, file names gain an ``_unlabeled``
                        part, the converter gets ``--no_labels``, and the
                        loads use ``label_col=None`` instead of ``'y'``.
    """
    n_files = 5

    # Write the test data
    make_conversion_data(n_files,
                         from_suffix,
                         to_suffix,
                         with_labels=with_labels)

    # Directory holding the unmerged feature files
    conv_dir = join(_my_dir, 'train', 'test_conversion')

    # File names start with "<from>_to_<to>", without the leading dots
    source_tag = from_suffix.lstrip('.')
    target_tag = to_suffix.lstrip('.')
    prefix = '{}_to_{}'.format(source_tag, target_tag)

    # use '_unlabeled' as part of any file names when not using labels
    if with_labels:
        unlabeled_tag = ''
    else:
        unlabeled_tag = '_unlabeled'

    # Convert every unmerged file from the source to the target format
    for idx in range(n_files):
        source_name = '{}_{}{}{}'.format(prefix, idx, unlabeled_tag,
                                         from_suffix)
        target_name = '{}_{}{}{}'.format(prefix, idx, unlabeled_tag,
                                         to_suffix)
        convert_args = ['--quiet',
                        join(conv_dir, source_name),
                        join(conv_dir, target_name)]
        if not with_labels:
            convert_args.append('--no_labels')
        skll_convert.main(convert_args)

    # Keyword arguments shared by both loads below
    load_options = {'label_col': 'y' if with_labels else None,
                    'quiet': True}

    # Load the converted files as one merged featureset
    converted_names = []
    for idx in range(n_files):
        converted_names.append('{}_{}{}'.format(prefix, idx, unlabeled_tag))
    merged = _load_featureset(conv_dir, converted_names, to_suffix,
                              **load_options)

    # Load the pre-merged reference in the target format
    reference_names = ['{}{}_all'.format(prefix, unlabeled_tag)]
    premerged = _load_featureset(conv_dir, reference_names, to_suffix,
                                 **load_options)

    # Converted-then-merged data must equal the pre-generated merged
    # data: ids and labels first ...
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)

    # ... then the third item of each triple, walking both sets in step
    for merged_row, premerged_row in zip(merged, premerged):
        _, _, merged_feats = merged_row
        _, _, premerged_feats = premerged_row
        eq_(merged_feats, premerged_feats)

    # ... and finally the sorted feature names
    merged_names = sorted(merged.vectorizer.feature_names_)
    eq_(merged_names, sorted(premerged.vectorizer.feature_names_))

Example #24

0

Show file

def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Run the converter over the unmerged feature files, then verify that
    the converted files, merged at load time, equal the pre-merged file
    stored in the target format.

    :param from_suffix: suffix of the files given to the converter.
    :param to_suffix: suffix of the files the converter writes, and of
                      the pre-merged reference file.
    :param with_labels: if false, ``_unlabeled`` is inserted into file
                        names, ``--no_labels`` is passed to the converter
                        and ``label_col`` is ``None`` rather than ``'y'``.
    """
    file_count = 5

    # Create test data
    make_conversion_data(file_count,
                         from_suffix,
                         to_suffix,
                         with_labels=with_labels)

    # Where the unmerged feature files live
    work_dir = join(_my_dir, 'train', 'test_conversion')

    # Common prefix of every file name, e.g. "<from>_to_<to>"
    name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                    to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    name_infix = '' if with_labels else '_unlabeled'

    # Only added to the converter's arguments for unlabeled data
    extra_args = [] if with_labels else ['--no_labels']

    # One converter call per unmerged file; the per-file featureset names
    # are collected along the way for the merged load below.
    per_file_names = []
    for number in range(file_count):
        in_path = join(work_dir,
                       '{}_{}{}{}'.format(name_prefix, number, name_infix,
                                          from_suffix))
        out_path = join(work_dir,
                        '{}_{}{}{}'.format(name_prefix, number, name_infix,
                                           to_suffix))
        skll_convert.main(['--quiet', in_path, out_path] + extra_args)
        per_file_names.append('{}_{}{}'.format(name_prefix, number,
                                               name_infix))

    label_col = 'y' if with_labels else None

    # Converted files, merged by the loader
    merged_exs = _load_featureset(work_dir,
                                  per_file_names,
                                  to_suffix,
                                  label_col=label_col,
                                  quiet=True)

    # Pre-merged file in the target format
    premerged_exs = _load_featureset(
        work_dir,
        ['{}{}_all'.format(name_prefix, name_infix)],
        to_suffix,
        label_col=label_col,
        quiet=True)

    # Both loads must agree on ids, labels, per-example features and
    # (sorted) feature names.
    assert_array_equal(merged_exs.ids, premerged_exs.ids)
    assert_array_equal(merged_exs.labels, premerged_exs.labels)
    for left, right in zip(merged_exs, premerged_exs):
        (_, _, left_feats), (_, _, right_feats) = left, right
        eq_(left_feats, right_feats)
    eq_(sorted(merged_exs.vectorizer.feature_names_),
        sorted(premerged_exs.vectorizer.feature_names_))