def test_skll_convert_libsvm_map():
    """
    Check that skll_convert honors the --reuse_libsvm_map option.
    """
    # generate a simple classification featureset with one string feature
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # serialize it as a libsvm file; this file carries the feature-index
    # map that we will later ask skll_convert to reuse
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True).write()

    # duplicate the featureset and swap its first two feature columns,
    # keeping the vectorizer vocabulary consistent with the swap
    swapped_fs = copy.deepcopy(orig_fs)
    vocab = swapped_fs.vectorizer.vocabulary_
    del vocab['f01']
    del vocab['f02']
    vocab['f01'] = 1
    vocab['f02'] = 0
    first_column = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = first_column

    # write the column-swapped featureset out in MegaM format
    # NOTE(review): this file is created but never passed to skll_convert
    # below — presumably intentional per the upstream test; confirm
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    MegaMWriter(swapped_megam_file, swapped_fs, quiet=True).write()

    # destination for the converted libsvm output
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # run skll_convert's main(), reusing the mapping from the first
    # libsvm file; capture stderr so errors show up in the test log
    skll_convert_cmd = ['--reuse_libsvm_map', orig_libsvm_file,
                        '--quiet', orig_libsvm_file,
                        converted_libsvm_file]
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = captured = StringIO()
        sk.main(skll_convert_cmd)
        err = captured.getvalue()
    finally:
        sys.stderr = old_stderr
    print(err)

    # read the converted file back and verify it equals the original
    converted_fs = LibSVMReader(converted_libsvm_file, quiet=True).read()
    eq_(orig_fs, converted_fs)
def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for
    skll_convert.
    """
    # build a small classification featureset that includes a string feature
    original_fs, _ = make_classification_data(train_test_ratio=1.0,
                                              one_string_feature=True)

    # dump the featureset to disk in libsvm format so that its
    # feature-index map can be reused by skll_convert later on
    original_libsvm_path = join(_my_dir, 'other',
                                'test_skll_convert_libsvm_map.libsvm')
    writer = LibSVMWriter(original_libsvm_path, original_fs, quiet=True)
    writer.write()

    # clone the featureset and exchange the first two columns, updating
    # the vectorizer vocabulary so that it matches the new column order
    shuffled_fs = copy.deepcopy(original_fs)
    for feature_name in ('f01', 'f02'):
        del shuffled_fs.vectorizer.vocabulary_[feature_name]
    shuffled_fs.vectorizer.vocabulary_['f01'] = 1
    shuffled_fs.vectorizer.vocabulary_['f02'] = 0
    saved_column = shuffled_fs.features[:, 0]
    shuffled_fs.features[:, 0] = shuffled_fs.features[:, 1]
    shuffled_fs.features[:, 1] = saved_column

    # persist the column-swapped featureset in MegaM format
    shuffled_megam_path = join(_my_dir, 'other',
                               'test_skll_convert_libsvm_map.megam')
    writer = MegaMWriter(shuffled_megam_path, shuffled_fs, quiet=True)
    writer.write()

    # where the converted libsvm output should land
    converted_libsvm_path = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # call skll_convert's main with the reuse flag while capturing stderr
    cmd = ['--reuse_libsvm_map', original_libsvm_path,
           '--quiet', original_libsvm_path,
           converted_libsvm_path]
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = buf = StringIO()
        sk.main(cmd)
        err = buf.getvalue()
    finally:
        sys.stderr = old_stderr
    print(err)

    # re-read the converted file and confirm it matches the original
    reader = LibSVMReader(converted_libsvm_path, quiet=True)
    eq_(original_fs, reader.read())
def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert unmerged feature files from `from_suffix` to `to_suffix`,
    merge them, and verify the result against pre-merged reference data.
    """
    num_feat_files = 5

    # generate the on-disk fixture data
    make_conversion_data(num_feat_files, from_suffix, to_suffix)

    # directory that holds the unmerged feature files
    dirpath = os.path.join(_my_dir, 'train', 'test_conversion')

    # common prefix shared by all generated file names
    prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                               to_suffix.lstrip('.'))

    # convert every unmerged file into the target format
    for idx in range(num_feat_files):
        src = os.path.join(dirpath,
                           '{}_{}{}'.format(prefix, idx, from_suffix))
        dst = os.path.join(dirpath,
                           '{}_{}{}'.format(prefix, idx, to_suffix))
        skll_convert.main(['--quiet', src, dst])

    # load and merge the freshly converted files
    converted_names = ['{}_{}'.format(prefix, i)
                       for i in range(num_feat_files)]
    merged = _load_featureset(dirpath, converted_names, to_suffix,
                              quiet=True)

    # load the pre-merged reference data in the target format
    premerged = _load_featureset(dirpath, ['{}_all'.format(prefix)],
                                 to_suffix, quiet=True)

    # converted-then-merged data must equal the pre-merged reference
    assert np.all(merged.ids == premerged.ids)
    assert np.all(merged.classes == premerged.classes)
    assert np.all(merged.features.todense() ==
                  premerged.features.todense())
    eq_(merged.feat_vectorizer.feature_names_,
        premerged.feat_vectorizer.feature_names_)
    eq_(merged.feat_vectorizer.vocabulary_,
        premerged.feat_vectorizer.vocabulary_)
def check_convert_featureset(from_suffix, to_suffix):
    """
    Round-trip unmerged feature files through skll_convert and check
    that merging the converted files reproduces the pre-merged data.
    """
    num_feat_files = 5

    # create the test fixtures on disk
    make_conversion_data(num_feat_files, from_suffix, to_suffix)

    # location of the unmerged feature files
    dirpath = join(_my_dir, 'train', 'test_conversion')

    # prefix used for every generated file name
    name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                    to_suffix.lstrip('.'))

    # convert each unmerged file from `from_suffix` to `to_suffix`
    for i in range(num_feat_files):
        in_path = join(dirpath,
                       '{}_{}{}'.format(name_prefix, i, from_suffix))
        out_path = join(dirpath,
                        '{}_{}{}'.format(name_prefix, i, to_suffix))
        skll_convert.main(['--quiet', in_path, out_path])

    # load and merge all of the converted feature files
    converted_set = ['{}_{}'.format(name_prefix, i)
                     for i in range(num_feat_files)]
    merged = _load_featureset(dirpath, converted_set, to_suffix,
                              quiet=True)

    # load the pre-merged reference data in the `to_suffix` format
    reference_set = ['{}_all'.format(name_prefix)]
    premerged = _load_featureset(dirpath, reference_set, to_suffix,
                                 quiet=True)

    # the merged, converted data must match the pre-merged reference:
    # ids, labels, per-example features, and the vectorizer's names
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged, premerged):
        eq_(feats_a, feats_b)
    eq_(sorted(merged.vectorizer.feature_names_),
        sorted(premerged.vectorizer.feature_names_))
def check_convert_featureset(from_suffix, to_suffix):
    """
    Verify that converting unmerged feature files with skll_convert and
    then merging them yields the same data as the pre-merged fixture.
    """
    file_count = 5

    # write out the fixture data
    make_conversion_data(file_count, from_suffix, to_suffix)

    # directory containing the unmerged feature files
    data_dir = join(_my_dir, 'train', 'test_conversion')

    # shared prefix for the generated file names
    stem = '{}_to_{}'.format(from_suffix.lstrip('.'),
                             to_suffix.lstrip('.'))

    # run skll_convert on each unmerged file individually
    for n in range(file_count):
        source = join(data_dir, '{}_{}{}'.format(stem, n, from_suffix))
        target = join(data_dir, '{}_{}{}'.format(stem, n, to_suffix))
        skll_convert.main(['--quiet', source, target])

    # merge the converted files by loading them as one featureset
    merged = _load_featureset(data_dir,
                              ['{}_{}'.format(stem, n)
                               for n in range(file_count)],
                              to_suffix,
                              quiet=True)

    # load the already-merged reference featureset
    premerged = _load_featureset(data_dir,
                                 ['{}_all'.format(stem)],
                                 to_suffix,
                                 quiet=True)

    # both featuresets must agree on ids, labels, every example's
    # feature dict, and the set of feature names
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)
    for (_, _, lhs), (_, _, rhs) in zip(merged, premerged):
        eq_(lhs, rhs)
    eq_(sorted(merged.vectorizer.feature_names_),
        sorted(premerged.vectorizer.feature_names_))
def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert each unmerged feature file to the target format, merge the
    results, and compare against the pre-merged reference data.
    """
    n_files = 5

    # build the fixture files on disk
    make_conversion_data(n_files, from_suffix, to_suffix)

    # where the unmerged feature files live
    base_dir = os.path.join(_my_dir, 'train', 'test_conversion')

    # file-name prefix derived from the two suffixes
    base_name = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                  to_suffix.lstrip('.'))

    # convert every unmerged file to the `to_suffix` format
    for k in range(n_files):
        src_path = os.path.join(
            base_dir, '{}_{}{}'.format(base_name, k, from_suffix))
        dst_path = os.path.join(
            base_dir, '{}_{}{}'.format(base_name, k, to_suffix))
        skll_convert.main(['--quiet', src_path, dst_path])

    # load the converted files together, which merges them
    merged_set = _load_featureset(
        base_dir,
        ['{}_{}'.format(base_name, k) for k in range(n_files)],
        to_suffix,
        quiet=True)

    # load the pre-merged reference data
    reference_set = _load_featureset(
        base_dir, ['{}_all'.format(base_name)], to_suffix, quiet=True)

    # merged data and reference must be identical: ids, classes,
    # dense feature matrices, and vectorizer internals
    assert np.all(merged_set.ids == reference_set.ids)
    assert np.all(merged_set.classes == reference_set.classes)
    assert np.all(merged_set.features.todense() ==
                  reference_set.features.todense())
    eq_(merged_set.feat_vectorizer.feature_names_,
        reference_set.feat_vectorizer.feature_names_)
    eq_(merged_set.feat_vectorizer.vocabulary_,
        reference_set.feat_vectorizer.vocabulary_)
def check_skll_convert(from_suffix, to_suffix):
    """
    Convert a featureset from one file format to another via skll_convert
    and verify the round trip preserves the data.
    """
    # create a simple classification featureset with a string feature
    source_fs, _ = make_classification_data(train_test_ratio=1.0,
                                            one_string_feature=True)

    # input and output paths named after the two formats
    input_path = join(_my_dir, 'other',
                      'test_skll_convert_in{}'.format(from_suffix))
    output_path = join(_my_dir, 'other',
                       'test_skll_convert_out{}'.format(to_suffix))

    # write the featureset out using the writer for the source format
    EXT_TO_WRITER[from_suffix](input_path, source_fs, quiet=True).write()

    # run skll_convert, capturing stderr so nothing is silently lost
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = captured = StringIO()
        sk.main([input_path, output_path, '--quiet'])
        err = captured.getvalue()
    finally:
        sys.stderr = old_stderr
    print(err)

    # read the converted file with the reader for the target format
    round_tripped_fs = EXT_TO_READER[to_suffix](output_path,
                                                quiet=True).read()

    # the round-tripped featureset must equal the original
    eq_(source_fs, round_tripped_fs)
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Convert unmerged feature files between formats — with or without
    labels — merge them, and compare against pre-merged reference data.
    """
    num_feat_files = 5

    # generate the fixture data, labeled or unlabeled as requested
    make_conversion_data(num_feat_files, from_suffix, to_suffix,
                         with_labels=with_labels)

    # directory holding the unmerged feature files
    dirpath = join(_my_dir, 'train', 'test_conversion')

    # shared prefix for all generated file names
    prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                               to_suffix.lstrip('.'))

    # unlabeled fixtures carry an extra '_unlabeled' name component
    suffix_part = '' if with_labels else '_unlabeled'

    # convert each unmerged file, adding --no_labels when appropriate
    for idx in range(num_feat_files):
        src = join(dirpath, '{}_{}{}{}'.format(prefix, idx,
                                               suffix_part, from_suffix))
        dst = join(dirpath, '{}_{}{}{}'.format(prefix, idx,
                                               suffix_part, to_suffix))
        args = ['--quiet', src, dst]
        if not with_labels:
            args.append('--no_labels')
        skll_convert.main(args)

    # the label column only exists in the labeled variant
    label_col = 'y' if with_labels else None

    # load and merge the converted files
    converted_names = ['{}_{}{}'.format(prefix, i, suffix_part)
                       for i in range(num_feat_files)]
    merged = _load_featureset(dirpath, converted_names, to_suffix,
                              label_col=label_col, quiet=True)

    # load the pre-merged reference data
    reference_names = ['{}{}_all'.format(prefix, suffix_part)]
    premerged = _load_featureset(dirpath, reference_names, to_suffix,
                                 label_col=label_col, quiet=True)

    # the two featuresets must agree on ids, labels, each example's
    # features, and the vectorizer's feature names
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged, premerged):
        eq_(feats_a, feats_b)
    eq_(sorted(merged.vectorizer.feature_names_),
        sorted(premerged.vectorizer.feature_names_))
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Round-trip unmerged (optionally unlabeled) feature files through
    skll_convert and check the merged result against the pre-merged
    reference fixture.
    """
    file_count = 5

    # build the on-disk fixtures
    make_conversion_data(file_count, from_suffix, to_suffix,
                         with_labels=with_labels)

    # directory with the unmerged feature files
    data_dir = join(_my_dir, 'train', 'test_conversion')

    # file-name prefix built from the two suffixes
    stem = '{}_to_{}'.format(from_suffix.lstrip('.'),
                             to_suffix.lstrip('.'))

    # file names gain '_unlabeled' when labels are disabled
    labeled_tag = '' if with_labels else '_unlabeled'

    # convert every unmerged file; pass --no_labels for unlabeled data
    for n in range(file_count):
        source = join(data_dir, '{}_{}{}{}'.format(stem, n, labeled_tag,
                                                   from_suffix))
        target = join(data_dir, '{}_{}{}{}'.format(stem, n, labeled_tag,
                                                   to_suffix))
        convert_args = ['--quiet', source, target]
        if not with_labels:
            convert_args.append('--no_labels')
        skll_convert.main(convert_args)

    # only labeled data has a 'y' label column
    label_column = 'y' if with_labels else None

    # merge the converted files by loading them as a single featureset
    merged = _load_featureset(
        data_dir,
        ['{}_{}{}'.format(stem, n, labeled_tag)
         for n in range(file_count)],
        to_suffix,
        label_col=label_column,
        quiet=True)

    # load the pre-merged reference featureset
    premerged = _load_featureset(
        data_dir,
        ['{}{}_all'.format(stem, labeled_tag)],
        to_suffix,
        label_col=label_column,
        quiet=True)

    # compare ids, labels, per-example feature dicts, and feature names
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)
    for (_, _, lhs), (_, _, rhs) in zip(merged, premerged):
        eq_(lhs, rhs)
    eq_(sorted(merged.vectorizer.feature_names_),
        sorted(premerged.vectorizer.feature_names_))