Code Example #1
File: test_output.py  Project: BenJamesbabala/skll
def make_learning_curve_data():

    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # create featureset with all features
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs1 = FeatureSet('train1',
                     features=features,
                     labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve1.jsonlines')
    writer = NDJWriter(train_path, fs1)
    writer.write()

    # create featureset with all except the last feature
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names[:-1], row)))
    fs2 = FeatureSet('train2',
                     features=features,
                     labels=y,
                     ids=list(range(X.shape[0])))

    # Write this feature set to file
    train_path = join(_my_dir, 'train', 'test_learning_curve2.jsonlines')
    writer = NDJWriter(train_path, fs2)
    writer.write()
Code Example #2
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.
    """
    import pandas

    # First, setup the test data.
    # get 100 instances with 4 features each
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index names anyway.
    # So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # if use_feature_hasher, run these tests with a vectorizer
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name,
                              ids,
                              features=features,
                              labels=y,
                              vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name,
                              ids,
                              features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pandas.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df,
                                             featureset_name,
                                             labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df,
                                             featureset_name,
                                             vectorizer=vectorizer)

    return (expected, current)
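
For reference, a minimal sketch of how the two unit tests mentioned in the docstring might call this helper; the test names and the plain assertions are illustrative, not taken from the project:

def test_featureset_creation_from_dataframe_with_labels():
    # labels present, no feature hashing
    expected, current = featureset_creation_from_dataframe_helper(True, False)
    assert expected == current


def test_featureset_creation_from_dataframe_without_labels():
    # labels omitted, no feature hashing
    expected, current = featureset_creation_from_dataframe_helper(False, False)
    assert expected == current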
Code Example #3
File: test_preprocessing.py  Project: nimmen/skll
def make_scaling_data(use_feature_hashing=False):

    X, y = make_classification(n_samples=1000, n_classes=2,
                               n_features=5, n_informative=5,
                               n_redundant=0, random_state=1234567890)

    # we want to arbitrarily scale the various features to test the scaling
    scalers = np.array([1, 10, 100, 1000, 10000])
    X = X * scalers

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 1001)]

    # create a list of dictionaries as the features
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # split everything into training and testing portions
    train_features, test_features = features[:800], features[800:]
    train_y, test_y = y[:800], y[800:]
    train_ids, test_ids = ids[:800], ids[800:]

    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_scaling', train_ids,
                          features=train_features, labels=train_y,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('test_scaling', test_ids,
                         features=test_features, labels=test_y,
                         vectorizer=vectorizer)

    return (train_fs, test_fs)
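
A hedged usage sketch for the helper above: the two featuresets would typically be fed to a SKLL Learner with feature scaling enabled. The learner name and the feature_scaling value below are illustrative choices, not taken from the original test.

from skll import Learner

train_fs, test_fs = make_scaling_data()
# 'both' scales features using both their means and standard deviations
learner = Learner('LogisticRegression', feature_scaling='both')
learner.train(train_fs, grid_search=False)
results = learner.evaluate(test_fs)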
Code Example #4
def create_jsonlines_feature_files(path):

    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(6)
    ]
    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv',
                            ids,
                            features=sub_features,
                            labels=labels)

            writer = NDJWriter(file_path, fs)
            writer.write()

        # now write out the last file which is basically
        # identical to the last featureset we wrote
        # except that it has two extra instances
        fs = FeatureSet(
            'extra',
            ids +
            ['cat{}'.format(num_examples), 'dog{}'.format(num_examples + 1)],
            features=sub_features + [{}, {}],
            labels=labels + ['cat', 'dog'])
        file_path = join(path, 'f5.jsonlines')
        writer = NDJWriter(file_path, fs)
        writer.write()
Code Example #5
File: utils.py  Project: latuji/skll
def make_sparse_data(use_feature_hashing=False):
    """
    Function to create sparse data with two features always zero
    in the training set and a different one always zero in the
    test set
    """
    # Create training data
    X, y = make_classification(n_samples=500, n_features=3,
                               n_informative=3, n_redundant=0,
                               n_classes=2, random_state=1234567890)

    # we need features to be non-negative since we will be
    # using Naive Bayes later
    X = np.abs(X)

    # make sure that none of the features are zero
    X[np.where(X == 0)] += 1

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)]

    # create a list of dictionaries as the features
    # with f1 and f5 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = [0] + row.tolist() + [0]
        features.append(dict(zip(feature_names, row)))

    # use a FeatureHasher if we are asked to do feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_sparse', ids,
                          features=features, labels=y,
                          vectorizer=vectorizer)

    # now create the test set with f4 always 0 but nothing else
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=2, random_state=1234567890)
    X = np.abs(X)
    X[np.where(X == 0)] += 1
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)]

    # create a list of dictionaries as the features
    # with f4 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = row.tolist()
        row = row[:3] + [0] + row[3:]
        features.append(dict(zip(feature_names, row)))

    test_fs = FeatureSet('test_sparse', ids,
                         features=features, labels=y,
                         vectorizer=vectorizer)

    return train_fs, test_fs
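
As the comment about Naive Bayes suggests, these featuresets are meant for a learner that requires non-negative features; a hedged sketch, assuming SKLL's 'MultinomialNB' learner name:

from skll import Learner

train_fs, test_fs = make_sparse_data()
learner = Learner('MultinomialNB')  # multinomial Naive Bayes wrapper
learner.train(train_fs, grid_search=False)
results = learner.evaluate(test_fs)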
Code Example #6
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):

    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    feature_names = [
        'f{:02d}'.format(n)
        for n in range(start_feature_num, start_feature_num + num_features)
    ]
    features = [dict(zip(feature_names, row)) for row in X]

    # convert the weights array into a dictionary for convenience
    weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
Code Example #7
File: test_featureset.py  Project: BK-University/skll
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.
    """
    import pandas

    # First, setup the test data.
    # get 100 instances with 4 features each
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index names anyway.
    # So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # if use_feature_hasher, run these tests with a vectorizer
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)
    
    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name, ids, features=features, labels=y,
                              vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name, ids, features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pandas.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df, featureset_name, labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df, featureset_name, vectorizer=vectorizer)

    return (expected, current)
Code Example #8
def test_featureset_creation_from_dataframe_with_string_labels():

    dftest = pd.DataFrame({
        "id": [1, 2],
        "score": ['yes', 'no'],
        "text": ["a b", "b c"]
    })
    dftest.set_index("id", inplace=True)
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    test_dict_vectorizer = DictVectorizer()
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=dftest.index.values,
                         labels=dftest['score'].values,
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)

    output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read in the written file into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path, ids_to_floats=True).read()

    assert fs_test == fs_test2
Code Example #9
def test_reading_csv_and_tsv_with_drop_blanks():

    # create CSV and TSV strings with blanks
    test_csv = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    test_tsv = test_csv.replace(',', '\t')

    # specify pandas_kwargs for CSV and TSV readers
    kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    expected = pd.DataFrame(
        {
            'A': [1, 3, 2],
            'B': [1, 9, 7],
            'C': [6, 3, 7],
            'L': [None, None, None]
        },
        index=['EXAMPLE_0', 'EXAMPLE_1', 'EXAMPLE_2'])

    fs_expected = FeatureSet.from_data_frame(expected,
                                             'test',
                                             labels_column='L')

    fs_csv = CSVReader(StringIO(test_csv),
                       drop_blanks=True,
                       pandas_kwargs=kwargs).read()
    fs_csv.name = 'test'

    fs_tsv = TSVReader(StringIO(test_tsv),
                       drop_blanks=True,
                       pandas_kwargs=kwargs).read()
    fs_tsv.name = 'test'

    eq_(fs_csv, fs_expected)
    eq_(fs_tsv, fs_expected)
Code Example #10
def test_mismatch_labels_features():
    """
    Test to catch a mismatch between the shape of the labels vector and the feature matrix
    """

    # get 100 instances with 4 features but ignore the labels we
    # get from here
    X, y = make_classification(n_samples=100,
                               n_features=4,
                               n_informative=4,
                               n_redundant=0,
                               n_classes=3,
                               random_state=1234567890)

    # double-stack y to ensure we don't match the number of feature rows
    y2 = np.hstack([y, y])

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # get 100 ids
    ids = ['EXAMPLE_{}'.format(i) for i in range(100)]

    # This should raise a ValueError
    FeatureSet('test', ids, features=features, labels=y2)
Code Example #11
def setup_cv_split_iterator(cv_folds, examples):
    """
    Set up a cross-validation split iterator over the given ``FeatureSet``.

    Parameters
    ----------
    cv_folds : int or dict
        The number of folds to use for cross-validation, or
        a mapping from example IDs to folds.
    examples : skll.FeatureSet
        The given featureset which is to be split.

    Returns
    -------
    res : a 2-tuple
        The first element is an iterator over the train/test featuresets
        and the second is the maximum number of training samples available.
    """
    # seed the random number generator for replicability
    random_state = np.random.RandomState(123456789)

    # set up the cross-validation split iterator with 20% of
    # the data always reserved for testing
    cv = ShuffleSplit(n_splits=cv_folds,
                      test_size=0.2,
                      random_state=random_state)
    cv_iter = list(cv.split(examples.features, examples.labels, None))
    n_max_training_samples = len(cv_iter[0][0])

    # create an iterator over train/test featuresets based on the
    # cross-validation index iterator
    featureset_iter = (FeatureSet.split_by_ids(examples, train, test)
                       for train, test in cv_iter)

    return featureset_iter, n_max_training_samples
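
A minimal consumption sketch, assuming a small hand-built FeatureSet; the names and sizes are illustrative:

from skll.data import FeatureSet

fs = FeatureSet('toy',
                ['EXAMPLE_{}'.format(i) for i in range(50)],
                labels=[i % 2 for i in range(50)],
                features=[{'f1': i} for i in range(50)])

# each item yielded by the iterator is a (train, test) pair of FeatureSets
featureset_iter, n_max_training_samples = setup_cv_split_iterator(10, fs)
for fold_train_fs, fold_test_fs in featureset_iter:
    print(len(fold_train_fs.ids), len(fold_test_fs.ids))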
Code Example #12
def check_dummy_classifier_predict(model_args, train_labels, expected_output):

    # create hard-coded featuresets with known labels
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=train_labels,
                          features=[{"feature": i} for i in range(20)])

    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # Ensure predictions are as expected for the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predictions), True)
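
A quick illustrative call, hedged: with 14 zeros and 6 ones as training labels, the 'most_frequent' strategy should predict 0 for every test example (this mirrors the data used in Code Example #17 below):

import numpy as np

check_dummy_classifier_predict({"strategy": "most_frequent"},
                               ([0] * 14) + ([1] * 6),
                               np.array([0] * 10))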
Code Example #13
def test_feature_merging_order_invariance():
    """
    Test whether featuresets with different orders of IDs can be merged
    """

    # First, randomly generate two feature sets and then make sure they have
    # the same labels.
    train_fs1, _, _ = make_regression_data()
    train_fs2, _, _ = make_regression_data(start_feature_num=3,
                                           random_state=87654321)
    train_fs2.labels = train_fs1.labels.copy()

    # make a shuffled copy of feature set 2
    shuffled_indices = list(range(len(train_fs2.ids)))
    np.random.seed(123456789)
    np.random.shuffle(shuffled_indices)
    train_fs2_ids_shuf = train_fs2.ids[shuffled_indices]
    train_fs2_labels_shuf = train_fs2.labels[shuffled_indices]
    train_fs2_features_shuf = train_fs2.features[shuffled_indices]
    train_fs2_shuf = FeatureSet("f2_shuf",
                                train_fs2_ids_shuf,
                                labels=train_fs2_labels_shuf,
                                features=train_fs2_features_shuf,
                                vectorizer=train_fs2.vectorizer)

    # merge feature set 1 with feature set 2 and its shuffled version
    merged_fs = train_fs1 + train_fs2
    merged_fs_shuf = train_fs1 + train_fs2_shuf

    # check that the two merged versions are the same
    feature_names = (train_fs1.vectorizer.get_feature_names() +
                     train_fs2.vectorizer.get_feature_names())
    assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names)
    assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                       feature_names)

    assert_array_equal(merged_fs.labels, train_fs1.labels)
    assert_array_equal(merged_fs.labels, train_fs2.labels)
    assert_array_equal(merged_fs.labels, merged_fs_shuf.labels)

    assert_array_equal(merged_fs.ids, train_fs1.ids)
    assert_array_equal(merged_fs.ids, train_fs2.ids)
    assert_array_equal(merged_fs.ids, merged_fs_shuf.ids)

    assert_array_equal(merged_fs.features[:, 0:2].todense(),
                       train_fs1.features.todense())
    assert_array_equal(merged_fs.features[:, 2:4].todense(),
                       train_fs2.features.todense())
    assert_array_equal(merged_fs.features.todense(),
                       merged_fs_shuf.features.todense())

    assert not np.all(merged_fs.features[:, 0:2].todense() ==
                      merged_fs.features[:, 2:4].todense())
Code Example #14
File: test_cv.py  Project: monkidea/skll
def make_cv_folds_data(num_examples_per_fold=100,
                       num_folds=3,
                       use_feature_hashing=False):
    """
    Create data for pre-specified CV folds tests
    with or without feature hashing
    """

    num_total_examples = num_examples_per_fold * num_folds

    # create the numeric features and the binary labels
    X, _ = make_classification(n_samples=num_total_examples,
                               n_features=3,
                               n_informative=3,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)
    y = np.array([0, 1] * int(num_total_examples / 2))

    # the folds mapping: the first num_examples_per_fold examples
    # are in fold 1 the second num_examples_per_fold are in
    # fold 2 and so on
    foldgen = ([str(i)] * num_examples_per_fold for i in range(num_folds))
    folds = list(itertools.chain(*foldgen))

    # now create the list of feature dictionaries
    # and add the binary features that depend on
    # the class and fold number
    feature_names = ['f{}'.format(i) for i in range(1, 4)]
    features = []
    for row, classid, foldnum in zip(X, y, folds):
        string_feature_name = 'is_{}_{}'.format(classid, foldnum)
        string_feature_value = 1
        feat_dict = dict(zip(feature_names, row))
        feat_dict.update({string_feature_name: string_feature_value})
        features.append(feat_dict)

    # create the example IDs
    ids = [
        'EXAMPLE_{}'.format(num_examples_per_fold * k + i)
        for k in range(num_folds) for i in range(num_examples_per_fold)
    ]

    # create the cross-validation feature set with or without feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    cv_fs = FeatureSet('cv_folds',
                       ids,
                       features=features,
                       labels=y,
                       vectorizer=vectorizer)

    # make the custom cv folds dictionary
    custom_cv_folds = dict(zip(ids, folds))

    return (cv_fs, custom_cv_folds)
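
A hedged sketch of how the custom folds dictionary might be used: SKLL's Learner.cross_validate() accepts either a number of folds or a mapping from example IDs to folds, as the docstring in Code Example #11 describes. The learner name is illustrative.

from skll import Learner

cv_fs, custom_cv_folds = make_cv_folds_data()
learner = Learner('LogisticRegression')
# run cross-validation using the pre-specified folds
results = learner.cross_validate(cv_fs,
                                 cv_folds=custom_cv_folds,
                                 grid_search=False)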
Code Example #15
def make_class_map_data():
    # Create training file
    train_path = join(_my_dir, 'train', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    class_names = ['beagle', 'cat', 'dachsund', 'cat']
    for i in range(1, 101):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # note that f1 and f5 are missing in all instances but f4 is not
        x = {"f2": i + 1, "f3": i + 2, "f4": i + 5}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    train_fs = FeatureSet('train_class_map',
                          ids,
                          features=features,
                          labels=labels)
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Create test file
    test_path = join(_my_dir, 'test', 'test_class_map.jsonlines')
    ids = []
    labels = []
    features = []
    for i in range(1, 51):
        y = class_names[i % 4]
        ex_id = "{}{}".format(y, i)
        # f1 and f5 are not missing in any instances here but f4 is
        x = {"f1": i, "f2": i + 2, "f3": i % 10, "f5": i * 2}
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    test_fs = FeatureSet('test_class_map',
                         ids,
                         features=features,
                         labels=labels)
    writer = NDJWriter(test_path, test_fs)
    writer.write()
Code Example #16
    def read(self):
        """
        Read examples from list of dictionaries.

        Returns
        -------
        feature_set : skll.FeatureSet
            FeatureSet representing the list of dictionaries we read in.
        """
        ids = []
        labels = []
        feat_dicts = []
        for example_num, example in enumerate(self.path_or_list):
            curr_id = str(example.get("id",
                                      "EXAMPLE_{}".format(example_num)))
            class_name = (safe_float(example['y'],
                                     replace_dict=self.class_map)
                          if 'y' in example else None)
            example = example['x']

            # Update lists of IDs, labels, and feature dictionaries
            if self.ids_to_floats:
                try:
                    curr_id = float(curr_id)
                except ValueError:
                    raise ValueError(('You set ids_to_floats to true, but ID '
                                      '{} could not be converted to float in '
                                      '{}').format(curr_id, self.path_or_list))
            ids.append(curr_id)
            labels.append(class_name)
            feat_dicts.append(example)

            # Print out status
            if example_num % 100 == 0:
                self._print_progress(example_num)

        # Convert lists to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)
        features = self.vectorizer.fit_transform(feat_dicts)

        return FeatureSet('converted', ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
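
This read() method consumes a list of dictionaries with 'id', 'y', and 'x' keys; a hedged usage sketch, assuming it belongs to SKLL's DictListReader:

from skll.data.readers import DictListReader

examples = [{"id": "EXAMPLE_0", "y": "cat", "x": {"f1": 1.0, "f2": 0.0}},
            {"id": "EXAMPLE_1", "y": "dog", "x": {"f1": 0.0, "f2": 1.0}}]
fs = DictListReader(examples).read()
print(fs.ids, fs.labels)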
Code Example #17
def test_dummy_classifier_predict():
    # hard-code dataset
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=([0] * 14) + ([1] * 6),
                          features=[{"feature": i} for i in range(20)])

    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    toy_data = [({"strategy": "stratified", "random_state": 12345},
                 np.array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])),
                ({"strategy": "most_frequent"},
                 np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
                ({"strategy": "constant", "constant": 1},
                 np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))]

    # Ensure predictions are correct for all strategies.
    correct = []
    for model_args, expected_output in toy_data:
        learner = Learner('DummyClassifier', model_kwargs=model_args)
        learner.train(train_fs)
        predictions = learner.predict(test_fs)
        correct.append(np.array_equal(expected_output, predictions))
    eq_(correct, [True, True, True])
Code Example #18
File: test_classification.py  Project: ofergold/skll
def make_float_class_data():
    """
    We want to create data that has labels that look like
    floats to make sure they are preserved correctly
    """

    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 76)]
    y = [1.2] * 25 + [1.5] * 25 + [1.8] * 25
    X = np.vstack([np.identity(25), np.identity(25), np.identity(25)])
    feature_names = ['f{}'.format(i) for i in range(1, 26)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    return FeatureSet('float-classes', ids, features=features, labels=y)
Code Example #19
def test_learning_curve_implementation():
    """
    Test to ensure that the learning curve results match scikit-learn
    """

    # This test is different from the other tests which just use regression data.
    # The reason is that we want this test to fail in case our implementation
    # diverges from the scikit-learn implementation. This test essentially
    # serves as a regression test as well.

    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # get the learning curve results from scikit-learn for this data
    cv_folds = 10
    random_state = 123456789
    cv = ShuffleSplit(n_splits=cv_folds, test_size=0.2, random_state=random_state)
    estimator = MultinomialNB()
    train_sizes = np.linspace(.1, 1.0, 5)
    train_sizes1, train_scores1, test_scores1 = learning_curve(estimator,
                                                               X,
                                                               y,
                                                               cv=cv,
                                                               train_sizes=train_sizes,
                                                               scoring='accuracy')

    # get the features from this data into a FeatureSet instance we can use
    # with the SKLL API
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs = FeatureSet('train', features=features, labels=y, ids=list(range(X.shape[0])))

    # we don't want to filter out any features since scikit-learn
    # does not do that either
    learner = Learner('MultinomialNB', min_feature_count=0)
    (train_scores2,
     test_scores2,
     train_sizes2) = learner.learning_curve(fs,
                                            cv_folds=cv_folds,
                                            train_sizes=train_sizes,
                                            metric='accuracy')

    assert np.all(train_sizes1 == train_sizes2)
    assert np.allclose(train_scores1, train_scores2)
    assert np.allclose(test_scores1, test_scores2)
Code Example #20
def make_rare_class_data():
    """
    We want to create data with five instances per class for three labels,
    where each instance within a group of five has only a single feature
    firing.
    """

    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 16)]
    y = [0] * 5 + [1] * 5 + [2] * 5
    X = np.vstack([np.identity(5), np.identity(5), np.identity(5)])
    feature_names = ['f{}'.format(i) for i in range(1, 6)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    return FeatureSet('rare-class', ids, features=features, labels=y)
Code Example #21
    def read(self):
        """
        Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
        `.ndj`, or `.tsv` formats.

        Returns
        -------
        feature_set : skll.FeatureSet
            ``FeatureSet`` instance representing the input file.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        self.logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        if self._use_pandas:
            ids, labels, features = self._sub_read(self.path_or_list)
        else:
            ids, labels, features = self._sub_read_rows(self.path_or_list)

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(features)

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list, ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
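
A hedged usage sketch: concrete readers are normally obtained via Reader.for_path(), which picks a subclass from the file extension and then calls this read() method. The path below is illustrative.

from skll.data import Reader

reader = Reader.for_path('train/f0.jsonlines', quiet=True)
fs = reader.read()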
Code Example #22
def create_jsonlines_feature_files(path):

    # we only need to create the feature files if they
    # don't already exist under the given path
    feature_files_to_create = [
        join(path, 'f{}.jsonlines'.format(i)) for i in range(5)
    ]
    if all([exists(ff) for ff in feature_files_to_create]):
        return
    else:
        num_examples = 1000
        np.random.seed(1234567890)

        # Create lists we will write files from
        ids = []
        features = []
        labels = []
        for j in range(num_examples):
            y = "dog" if j % 2 == 0 else "cat"
            ex_id = "{}{}".format(y, j)
            x = {
                "f{}".format(feat_num): np.random.randint(0, 4)
                for feat_num in range(5)
            }
            x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
            ids.append(ex_id)
            labels.append(y)
            features.append(x)

        for i in range(5):
            file_path = join(path, 'f{}.jsonlines'.format(i))
            sub_features = []
            for example_num in range(num_examples):
                feat_num = i
                x = {
                    "f{}".format(feat_num):
                    features[example_num]["f{}".format(feat_num)]
                }
                sub_features.append(x)
            fs = FeatureSet('ablation_cv',
                            ids,
                            features=sub_features,
                            labels=labels)
            writer = NDJWriter(file_path, fs)
            writer.write()
Code Example #23
def test_writing_ndj_featureset_with_string_ids():
    test_dict_vectorizer = DictVectorizer()
    test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
    Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
    fs_test = FeatureSet('test',
                         ids=['1', '2'],
                         labels=[1, 2],
                         features=Xtest,
                         vectorizer=test_dict_vectorizer)
    output_path = join(_my_dir, "other", "test_string_ids.jsonlines")
    test_writer = NDJWriter(output_path, fs_test)
    test_writer.write()

    # read in the written file into a featureset and confirm that the
    # two featuresets are equal
    fs_test2 = NDJReader.for_path(output_path).read()

    assert fs_test == fs_test2
Code Example #24
File: test_featureset.py  Project: monkidea/skll
def test_empty_ids():
    """
    Test to ensure that an error is raised if ids is None
    """

    # get 100 instances with 4 features each
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # create a feature set with ids set to None and raise ValueError
    FeatureSet('test', None, features=features, labels=y)
Code Example #25
def make_ablation_data():
    # Remove old CV data
    for old_file in glob.glob(join(_my_dir, 'output',
                                   'ablation_cv_*.results')):
        os.remove(old_file)

    num_examples = 1000

    np.random.seed(1234567890)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {
            "f{}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(5)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    for i in range(5):
        train_path = join(_my_dir, 'train', 'f{}.jsonlines'.format(i))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i
            x = {
                "f{}".format(feat_num):
                features[example_num]["f{}".format(feat_num)]
            }
            sub_features.append(x)
        train_fs = FeatureSet('ablation_cv',
                              ids,
                              features=sub_features,
                              labels=labels)
        writer = NDJWriter(train_path, train_fs)
        writer.write()
Code Example #26
File: test_featureset.py  Project: monkidea/skll
def test_mismatch_ids_features():
    """
    Test to catch a mismatch between the shape of the ids vector and the feature matrix
    """

    # get 100 instances with 4 features each
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # get 200 ids since we don't want to match the number of feature rows
    ids = ['EXAMPLE_{}'.format(i) for i in range(200)]

    # This should raise a ValueError
    FeatureSet('test', ids, features=features, labels=y)
Code Example #27
def make_merging_data(num_feat_files, suffix, numeric_ids):
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = join(_my_dir, 'train', 'test_merging')
    if not exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Unmerged
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = [
            "f{:03d}".format(feat_num + j) for j in range(num_feats_per_file)
        ]
    train_path = join(merge_dir, suffix)
    train_fs = FeatureSet('train', ids, labels=labels, features=features)
    Writer.for_path(train_path, train_fs, subsets=subset_dict).write()

    # Merged
    train_path = join(merge_dir, 'all{}'.format(suffix))
    Writer.for_path(train_path, train_fs).write()
Code Example #28
def test_reading_csv_and_tsv_with_fill_blanks_with_dictionary():

    # create CSV and TSV strings with blanks
    test_csv = '1,1,6\n2,,2\n3,9,3\n,,\n,5,\n,,\n2,7,7'
    test_tsv = test_csv.replace(',', '\t')

    # specify pandas_kwargs for CSV and TSV readers
    kwargs = {'header': None, 'names': ['A', 'B', 'C']}

    expected = pd.DataFrame(
        {
            'A': [1, 2, 3, 4.5, 4.5, 4.5, 2],
            'B': [1, 2.5, 9, 2.5, 5, 2.5, 7],
            'C': [6, 2, 3, 1, 1, 1, 7],
            'L': [None, None, None, None, None, None, None]
        },
        index=[
            'EXAMPLE_0', 'EXAMPLE_1', 'EXAMPLE_2', 'EXAMPLE_3', 'EXAMPLE_4',
            'EXAMPLE_5', 'EXAMPLE_6'
        ])

    fs_expected = FeatureSet.from_data_frame(expected,
                                             'test',
                                             labels_column='L')

    replacement_dict = {'A': 4.5, 'B': 2.5, 'C': 1}
    fs_csv = CSVReader(StringIO(test_csv),
                       replace_blanks_with=replacement_dict,
                       pandas_kwargs=kwargs).read()
    fs_csv.name = 'test'

    fs_tsv = TSVReader(StringIO(test_tsv),
                       replace_blanks_with=replacement_dict,
                       pandas_kwargs=kwargs).read()
    fs_tsv.name = 'test'

    eq_(fs_csv, fs_expected)
    eq_(fs_tsv, fs_expected)
Code Example #29
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):

    # if we are doing feature hashing and we have asked for more
    # feature bins than number of total features, we need to
    # handle that because `make_regression()` doesn't know
    # about hashing
    if use_feature_hashing and num_features < feature_bins:
        num_features = feature_bins

    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    index_width_for_feature_name = int(floor(log10(num_features))) + 1
    feature_names = []
    for n in range(start_feature_num, start_feature_num + num_features):
        index_str = str(n).zfill(index_width_for_feature_name)
        feature_name = 'f{}'.format(index_str)
        feature_names.append(feature_name)
    features = [dict(zip(feature_names, row)) for row in X]

    # At this point the labels are generated using unhashed features
    # even if we want to do feature hashing. `make_regression()` from
    # sklearn doesn't know anything about feature hashing, so we need
    # a hack here to compute the updated labels ourselves
    # using the same command that sklearn uses inside `make_regression()`
    # which is to generate the X and the weights and then compute the
    # y as the dot product of the two. This y will then be used as our
    # labels instead of the original y we got from `make_regression()`.
    # Note that we only want to use the number of weights that are
    # equal to the number of feature bins for the hashing
    if use_feature_hashing:
        feature_hasher = FeatureHasher(n_features=feature_bins)
        hashed_X = feature_hasher.fit_transform(features)
        y = hashed_X.dot(weights[:feature_bins])

    # convert the weights array into a dictionary for convenience
    # if we are using feature hashing, we need to use the names
    # that would be output by `model_params()` instead of the
    # original names since that's what we would get from SKLL
    if use_feature_hashing:
        index_width_for_feature_name = int(floor(log10(feature_bins))) + 1
        hashed_feature_names = []
        for i in range(feature_bins):
            index_str = str(i + 1).zfill(index_width_for_feature_name)
            feature_name = 'hashed_feature_{}'.format(index_str)
            hashed_feature_names.append(feature_name)
        weightdict = dict(zip(hashed_feature_names, weights[:feature_bins]))
    else:
        weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
Code Example #30
def make_classification_data(num_examples=100,
                             train_test_ratio=0.5,
                             num_features=10,
                             use_feature_hashing=False,
                             feature_bins=4,
                             num_labels=2,
                             empty_labels=False,
                             string_label_list=None,
                             feature_prefix='f',
                             id_type='string',
                             class_weights=None,
                             non_negative=False,
                             one_string_feature=False,
                             num_string_values=4,
                             random_state=1234567890):

    # use sklearn's make_classification to generate the data for us
    num_numeric_features = (num_features - 1 if one_string_feature
                            else num_features)
    X, y = make_classification(n_samples=num_examples,
                               n_features=num_numeric_features,
                               n_informative=num_numeric_features,
                               n_redundant=0,
                               n_classes=num_labels,
                               weights=class_weights,
                               random_state=random_state)

    if string_label_list:
        assert (len(string_label_list) == num_labels)
        label_to_string = np.vectorize(lambda n: string_label_list[n])
        y = label_to_string(y)

    # if we were told to only generate non-negative features, then
    # we can simply take the absolute values of the generated features
    if non_negative:
        X = abs(X)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs; we create IDs that either can also
    # be numbers or pure strings
    if id_type == 'string':
        ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer_string':
        ids = ['{}'.format(n) for n in range(1, num_examples + 1)]
    elif id_type == 'float':
        ids = [float(n) for n in range(1, num_examples + 1)]
    elif id_type == 'integer':
        ids = list(range(1, num_examples + 1))

    # create a string feature that has four possible values
    # 'a', 'b', 'c' and 'd' and add it to X at the end
    if one_string_feature:
        prng = RandomState(random_state)
        random_indices = prng.random_integers(0, num_string_values - 1,
                                              num_examples)
        possible_values = [chr(x) for x in range(97, 97 + num_string_values)]
        string_feature_values = [possible_values[i] for i in random_indices]
        string_feature_column = np.array(string_feature_values,
                                         dtype=object).reshape(num_examples, 1)
        X = np.append(X, string_feature_column, 1)

    # create a list of dictionaries as the features
    feature_names = [
        '{}{:02d}'.format(feature_prefix, n)
        for n in range(1, num_features + 1)
    ]
    features = [dict(zip(feature_names, row)) for row in X]

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # are we told to generate empty labels
    train_labels = None if empty_labels else train_y
    test_labels = None if empty_labels else test_y

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hashing else None)
    train_fs = FeatureSet('classification_train',
                          train_ids,
                          labels=train_labels,
                          features=train_features,
                          vectorizer=vectorizer)
    if train_test_ratio < 1.0:
        test_fs = FeatureSet('classification_test',
                             test_ids,
                             labels=test_labels,
                             features=test_features,
                             vectorizer=vectorizer)
    else:
        test_fs = None

    return (train_fs, test_fs)
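
A hedged sketch of how this generator is typically consumed in a test; the learner name and parameter values are illustrative:

from skll import Learner

train_fs, test_fs = make_classification_data(num_examples=200,
                                              num_features=5,
                                              num_labels=3)
learner = Learner('LogisticRegression')
learner.train(train_fs, grid_search=False)
results = learner.evaluate(test_fs)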
Code Example #31
def make_conversion_data(num_feat_files, from_suffix, to_suffix, with_labels=True):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # if we are not using labels, we do not want zero-valued features
        # because it may be the case that some subset of features end up
        # being all 0 and if this subset ends up being written out to a file
        # below, then for some formats (e.g., megam) nothing will get written
        # out which can cause issues when reading this file
        lowest_feature_value = 0 if with_labels else 1
        x = {"f{:03d}".format(feat_num): np.random.randint(lowest_feature_value, 4 + lowest_feature_value) for feat_num
             in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        if with_labels:
            labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        label_map = {label: num for num, label in
                     enumerate(sorted({label for label in labels if
                                       not isinstance(label, (int, float))}))}
        # Add fake item to the label map for None
        label_map[None] = '00000'
    else:
        label_map = None

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir, '{}_{}{}{}'.format(feature_name_prefix,
                                                          i,
                                                          with_labels_part,
                                                          from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)] for j in
                 range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        elif from_suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            Writer.for_path(train_path, train_fs, label_col=label_col).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}{}_all{}'.format(feature_name_prefix,
                                                       with_labels_part,
                                                       to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)

    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs,
                        label_map=label_map).write()
    elif to_suffix in ['.arff', '.csv', '.tsv']:
        label_col = 'y' if with_labels else None
        Writer.for_path(train_path, train_fs, label_col=label_col).write()
    else:
        Writer.for_path(train_path, train_fs).write()
Code Example #32
def make_conversion_data(num_feat_files,
                         from_suffix,
                         to_suffix,
                         with_labels=True):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # if we are not using labels, we do not want zero-valued features
        # because it may be the case that some subset of features end up
        # being all 0 and if this subset ends up being written out to a file
        # below, then for some formats (e.g., megam) nothing will get written
        # out which can cause issues when reading this file
        lowest_feature_value = 0 if with_labels else 1
        x = {
            "f{:03d}".format(feat_num):
            np.random.randint(lowest_feature_value, 4 + lowest_feature_value)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        if with_labels:
            labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        label_map = {
            label: num
            for num, label in enumerate(
                sorted({
                    label
                    for label in labels if not isinstance(label, (int, float))
                }))
        }
        # Add fake item to the label map for None
        label_map[None] = '00000'
    else:
        label_map = None

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(
            convert_dir, '{}_{}{}{}'.format(feature_name_prefix, i,
                                            with_labels_part, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {
                "f{:03d}".format(feat_num + j):
                features[example_num]["f{:03d}".format(feat_num + j)]
                for j in range(num_feats_per_file)
            }
            sub_features.append(x)
        train_fs = FeatureSet('sub_train',
                              ids,
                              labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        elif from_suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            Writer.for_path(train_path, train_fs, label_col=label_col).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(
        convert_dir, '{}{}_all{}'.format(feature_name_prefix, with_labels_part,
                                         to_suffix))
    train_fs = FeatureSet('train',
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)

    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    elif to_suffix in ['.arff', '.csv', '.tsv']:
        label_col = 'y' if with_labels else None
        Writer.for_path(train_path, train_fs, label_col=label_col).write()
    else:
        Writer.for_path(train_path, train_fs).write()