# Example #1
def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for skll_convert
    """

    # create some simple classification data
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # now write out this feature set as a libsvm file
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    writer = LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True)
    writer.write()

    # now make a copy of the dataset
    swapped_fs = copy.deepcopy(orig_fs)

    # now modify this new featureset to swap the first two columns
    del swapped_fs.vectorizer.vocabulary_['f01']
    del swapped_fs.vectorizer.vocabulary_['f02']
    swapped_fs.vectorizer.vocabulary_['f01'] = 1
    swapped_fs.vectorizer.vocabulary_['f02'] = 0
    tmp = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = tmp

    # now write out this new feature set as a MegaM file
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    writer = MegaMWriter(swapped_megam_file, swapped_fs, quiet=True)
    writer.write()

    # now run skll_convert to convert the swapped MegaM file into a libsvm
    # file, but using the column mapping specified in the first libsvm file
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # FIX: the conversion input must be the swapped MegaM file, not
    # orig_libsvm_file — otherwise the MegaM write above is dead code and the
    # test trivially round-trips the original file without ever exercising
    # --reuse_libsvm_map.
    skll_convert_cmd = [
        '--reuse_libsvm_map', orig_libsvm_file, '--quiet', swapped_megam_file,
        converted_libsvm_file
    ]
    # capture stderr so any conversion errors show up in the test output
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = mystderr = StringIO()
        sk.main(skll_convert_cmd)
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # now read the converted libsvm file into a featureset
    reader = LibSVMReader(converted_libsvm_file, quiet=True)
    converted_fs = reader.read()

    # now ensure that this new featureset and the original
    # featureset are the same (the map restored the swapped columns)
    eq_(orig_fs, converted_fs)
# Example #2
def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for skll_convert
    """

    # create some simple classification data
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # now write out this feature set as a libsvm file
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    writer = LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True)
    writer.write()

    # now make a copy of the dataset
    swapped_fs = copy.deepcopy(orig_fs)

    # now modify this new featureset to swap the first two columns
    del swapped_fs.vectorizer.vocabulary_['f01']
    del swapped_fs.vectorizer.vocabulary_['f02']
    swapped_fs.vectorizer.vocabulary_['f01'] = 1
    swapped_fs.vectorizer.vocabulary_['f02'] = 0
    tmp = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = tmp

    # now write out this new feature set as a MegaM file
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    writer = MegaMWriter(swapped_megam_file, swapped_fs, quiet=True)
    writer.write()

    # now run skll_convert to convert the swapped MegaM file into a libsvm
    # file, but using the column mapping from the first libsvm file
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # FIX: pass the swapped MegaM file as the conversion input instead of
    # orig_libsvm_file; the original command made the MegaM write above dead
    # code and never actually tested --reuse_libsvm_map.
    skll_convert_cmd = ['--reuse_libsvm_map', orig_libsvm_file,
                        '--quiet', swapped_megam_file,
                        converted_libsvm_file]
    # capture stderr so any conversion errors show up in the test output
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = mystderr = StringIO()
        sk.main(skll_convert_cmd)
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # now read the converted libsvm file into a featureset
    reader = LibSVMReader(converted_libsvm_file, quiet=True)
    converted_fs = reader.read()

    # now ensure that this new featureset and the original
    # featureset are the same (the map restored the swapped columns)
    eq_(orig_fs, converted_fs)
# Example #3
def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert several feature files from ``from_suffix`` to ``to_suffix``
    and verify that merging the converted files matches the pre-merged data.
    """
    num_feat_files = 5

    # generate the unmerged and pre-merged fixtures on disk
    make_conversion_data(num_feat_files, from_suffix, to_suffix)

    # directory holding the unmerged feature files
    dirpath = os.path.join(_my_dir, 'train', 'test_conversion')

    # common prefix shared by all the fixture file names
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # convert every unmerged file from the source to the target format
    for idx in range(num_feat_files):
        src_path = os.path.join(dirpath,
                                '{}_{}{}'.format(feature_name_prefix, idx,
                                                 from_suffix))
        dst_path = os.path.join(dirpath,
                                '{}_{}{}'.format(feature_name_prefix, idx,
                                                 to_suffix))
        skll_convert.main(['--quiet', src_path, dst_path])

    # load and merge all of the freshly converted files
    featureset = ['{}_{}'.format(feature_name_prefix, idx)
                  for idx in range(num_feat_files)]
    merged_examples = _load_featureset(dirpath, featureset, to_suffix,
                                       quiet=True)

    # load the pre-merged fixture in the target format
    featureset = ['{}_all'.format(feature_name_prefix)]
    premerged_examples = _load_featureset(dirpath, featureset, to_suffix,
                                          quiet=True)

    # the converted-and-merged data must equal the pre-merged data:
    # IDs, classes, dense feature matrices, and vectorizer state
    assert np.all(merged_examples.ids == premerged_examples.ids)
    assert np.all(merged_examples.classes == premerged_examples.classes)
    assert np.all(merged_examples.features.todense() ==
                  premerged_examples.features.todense())
    eq_(merged_examples.feat_vectorizer.feature_names_,
        premerged_examples.feat_vectorizer.feature_names_)
    eq_(merged_examples.feat_vectorizer.vocabulary_,
        premerged_examples.feat_vectorizer.vocabulary_)
# Example #4
def check_convert_featureset(from_suffix, to_suffix):
    """
    Round-trip feature files through skll_convert from ``from_suffix`` to
    ``to_suffix`` and compare the merged result against the pre-merged
    fixture.
    """
    num_feat_files = 5

    # write the unmerged and pre-merged fixtures to disk
    make_conversion_data(num_feat_files, from_suffix, to_suffix)

    # directory containing the unmerged feature files
    dirpath = join(_my_dir, 'train', 'test_conversion')

    # file-name prefix shared by every fixture
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # convert each unmerged file into the target format
    for idx in range(num_feat_files):
        src_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
                                                  idx, from_suffix))
        dst_path = join(dirpath, '{}_{}{}'.format(feature_name_prefix,
                                                  idx, to_suffix))
        skll_convert.main(['--quiet', src_path, dst_path])

    # merge all of the freshly converted files
    featureset = ['{}_{}'.format(feature_name_prefix, idx)
                  for idx in range(num_feat_files)]
    merged_exs = _load_featureset(dirpath, featureset, to_suffix, quiet=True)

    # read the pre-merged fixture in the target format
    featureset = ['{}_all'.format(feature_name_prefix)]
    premerged_exs = _load_featureset(dirpath, featureset, to_suffix,
                                     quiet=True)

    # IDs, labels, per-example features, and feature names must all match
    assert_array_equal(merged_exs.ids, premerged_exs.ids)
    assert_array_equal(merged_exs.labels, premerged_exs.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged_exs, premerged_exs):
        eq_(feats_a, feats_b)
    eq_(sorted(merged_exs.vectorizer.feature_names_),
        sorted(premerged_exs.vectorizer.feature_names_))
# Example #5
def check_convert_featureset(from_suffix, to_suffix):
    """
    Verify that files converted from ``from_suffix`` to ``to_suffix`` and
    then merged are identical to the pre-merged fixture data.
    """
    num_feat_files = 5

    # build the on-disk test fixtures (unmerged + pre-merged)
    make_conversion_data(num_feat_files, from_suffix, to_suffix)

    # directory where the unmerged feature files live
    dirpath = join(_my_dir, 'train', 'test_conversion')

    # prefix used in every fixture file name
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # run skll_convert on each unmerged source file
    for file_num in range(num_feat_files):
        in_path = join(dirpath,
                       '{}_{}{}'.format(feature_name_prefix, file_num,
                                        from_suffix))
        out_path = join(dirpath,
                        '{}_{}{}'.format(feature_name_prefix, file_num,
                                         to_suffix))
        skll_convert.main(['--quiet', in_path, out_path])

    # load + merge the converted files in the target format
    featureset = ['{}_{}'.format(feature_name_prefix, file_num)
                  for file_num in range(num_feat_files)]
    merged_exs = _load_featureset(dirpath, featureset, to_suffix, quiet=True)

    # load the pre-merged data in the target format
    featureset = ['{}_all'.format(feature_name_prefix)]
    premerged_exs = _load_featureset(dirpath, featureset, to_suffix,
                                     quiet=True)

    # compare IDs, labels, per-example feature dicts, and feature names
    assert_array_equal(merged_exs.ids, premerged_exs.ids)
    assert_array_equal(merged_exs.labels, premerged_exs.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged_exs, premerged_exs):
        eq_(feats_a, feats_b)
    eq_(sorted(merged_exs.vectorizer.feature_names_),
        sorted(premerged_exs.vectorizer.feature_names_))
# Example #6
def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert a batch of feature files from ``from_suffix`` to ``to_suffix``
    and check that merging them reproduces the pre-merged fixture exactly.
    """
    num_feat_files = 5

    # materialize the unmerged and pre-merged fixtures
    make_conversion_data(num_feat_files, from_suffix, to_suffix)

    # location of the unmerged feature files
    dirpath = os.path.join(_my_dir, 'train', 'test_conversion')

    # shared file-name prefix for all fixtures
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # convert each unmerged file to the target format via skll_convert
    for file_num in range(num_feat_files):
        in_path = os.path.join(dirpath,
                               '{}_{}{}'.format(feature_name_prefix,
                                                file_num, from_suffix))
        out_path = os.path.join(dirpath,
                                '{}_{}{}'.format(feature_name_prefix,
                                                 file_num, to_suffix))
        skll_convert.main(['--quiet', in_path, out_path])

    # merge all converted files in the target format
    featureset = ['{}_{}'.format(feature_name_prefix, file_num)
                  for file_num in range(num_feat_files)]
    merged_examples = _load_featureset(dirpath, featureset, to_suffix,
                                       quiet=True)

    # load the pre-merged fixture
    featureset = ['{}_all'.format(feature_name_prefix)]
    premerged_examples = _load_featureset(dirpath, featureset, to_suffix,
                                          quiet=True)

    # converted+merged must equal pre-merged: IDs, classes, dense feature
    # matrices, and the vectorizer's names/vocabulary
    assert np.all(merged_examples.ids == premerged_examples.ids)
    assert np.all(merged_examples.classes == premerged_examples.classes)
    assert np.all(merged_examples.features.todense() ==
                  premerged_examples.features.todense())
    eq_(merged_examples.feat_vectorizer.feature_names_,
        premerged_examples.feat_vectorizer.feature_names_)
    eq_(merged_examples.feat_vectorizer.vocabulary_,
        premerged_examples.feat_vectorizer.vocabulary_)
# Example #7
def check_skll_convert(from_suffix, to_suffix):
    """
    Write a feature set in the ``from_suffix`` format, convert it with
    skll_convert to ``to_suffix``, and verify the round trip is lossless.
    """
    # simple classification data with one string-valued feature
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # input and output paths for the conversion
    from_suffix_file = join(_my_dir, 'other',
                            'test_skll_convert_in{}'.format(from_suffix))
    to_suffix_file = join(_my_dir, 'other',
                          'test_skll_convert_out{}'.format(to_suffix))

    # serialize the feature set in the source format
    src_writer = EXT_TO_WRITER[from_suffix](from_suffix_file, orig_fs,
                                            quiet=True)
    src_writer.write()

    # command line for the format conversion
    skll_convert_cmd = [from_suffix_file, to_suffix_file, '--quiet']

    # capture stderr so conversion errors are visible in the test output
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = mystderr = StringIO()
        sk.main(skll_convert_cmd)
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # deserialize the converted file
    dst_reader = EXT_TO_READER[to_suffix](to_suffix_file, quiet=True)
    converted_fs = dst_reader.read()

    # the round-tripped feature set must equal the original
    eq_(orig_fs, converted_fs)
# Example #8
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Convert feature files from ``from_suffix`` to ``to_suffix``, optionally
    without labels, and verify that merging the converted files matches the
    pre-merged fixture data.
    """
    num_feat_files = 5

    # write the unmerged and pre-merged fixtures to disk
    make_conversion_data(num_feat_files,
                         from_suffix,
                         to_suffix,
                         with_labels=with_labels)

    # directory containing the unmerged feature files
    dirpath = join(_my_dir, 'train', 'test_conversion')

    # file-name prefix shared by every fixture
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # '_unlabeled' is inserted into file names for the no-labels variant
    with_labels_part = '' if with_labels else '_unlabeled'

    # convert each unmerged file into the target format
    for idx in range(num_feat_files):
        src_path = join(dirpath,
                        '{}_{}{}{}'.format(feature_name_prefix, idx,
                                           with_labels_part, from_suffix))
        dst_path = join(dirpath,
                        '{}_{}{}{}'.format(feature_name_prefix, idx,
                                           with_labels_part, to_suffix))
        convert_args = ['--quiet', src_path, dst_path]
        if not with_labels:
            convert_args.append('--no_labels')
        skll_convert.main(convert_args)

    # merge all of the freshly converted files
    featureset = ['{}_{}{}'.format(feature_name_prefix, idx, with_labels_part)
                  for idx in range(num_feat_files)]
    label_col = 'y' if with_labels else None
    merged_exs = _load_featureset(dirpath,
                                  featureset,
                                  to_suffix,
                                  label_col=label_col,
                                  quiet=True)

    # read the pre-merged fixture in the target format
    featureset = ['{}{}_all'.format(feature_name_prefix, with_labels_part)]
    premerged_exs = _load_featureset(dirpath,
                                     featureset,
                                     to_suffix,
                                     label_col=label_col,
                                     quiet=True)

    # IDs, labels, per-example features, and feature names must all match
    assert_array_equal(merged_exs.ids, premerged_exs.ids)
    assert_array_equal(merged_exs.labels, premerged_exs.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged_exs, premerged_exs):
        eq_(feats_a, feats_b)
    eq_(sorted(merged_exs.vectorizer.feature_names_),
        sorted(premerged_exs.vectorizer.feature_names_))
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Round-trip a batch of feature files through skll_convert (optionally
    without labels) and check that merging the results reproduces the
    pre-merged fixture.
    """
    num_feat_files = 5

    # materialize the unmerged and pre-merged fixtures on disk
    make_conversion_data(num_feat_files,
                         from_suffix,
                         to_suffix,
                         with_labels=with_labels)

    # location of the unmerged feature files
    dirpath = join(_my_dir, 'train', 'test_conversion')

    # shared file-name prefix for all fixtures
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # the no-labels variant adds '_unlabeled' to fixture file names
    with_labels_part = '' if with_labels else '_unlabeled'

    # run skll_convert on each unmerged source file
    for file_num in range(num_feat_files):
        in_path = join(dirpath,
                       '{}_{}{}{}'.format(feature_name_prefix, file_num,
                                          with_labels_part, from_suffix))
        out_path = join(dirpath,
                        '{}_{}{}{}'.format(feature_name_prefix, file_num,
                                           with_labels_part, to_suffix))
        cmd_args = ['--quiet', in_path, out_path]
        if not with_labels:
            cmd_args.append('--no_labels')
        skll_convert.main(cmd_args)

    # load + merge the converted files in the target format
    featureset = ['{}_{}{}'.format(feature_name_prefix, file_num,
                                   with_labels_part)
                  for file_num in range(num_feat_files)]
    label_col = 'y' if with_labels else None
    merged_exs = _load_featureset(dirpath,
                                  featureset,
                                  to_suffix,
                                  label_col=label_col,
                                  quiet=True)

    # load the pre-merged data in the target format
    featureset = ['{}{}_all'.format(feature_name_prefix, with_labels_part)]
    premerged_exs = _load_featureset(dirpath,
                                     featureset,
                                     to_suffix,
                                     label_col=label_col,
                                     quiet=True)

    # compare IDs, labels, per-example feature dicts, and feature names
    assert_array_equal(merged_exs.ids, premerged_exs.ids)
    assert_array_equal(merged_exs.labels, premerged_exs.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged_exs, premerged_exs):
        eq_(feats_a, feats_b)
    eq_(sorted(merged_exs.vectorizer.feature_names_),
        sorted(premerged_exs.vectorizer.feature_names_))