def test_calculate_all_metafeatures_multilabel(multilabel_train_data):
    # Clear cached helper-function results so the metafeatures
    # are recomputed from scratch.
    meta_features.helper_functions.clear()
    X, y = multilabel_train_data
    categorical = {i: False for i in range(10)}
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "Generated", logger=logging.getLogger('TestMeta'))
    assert 52 == len(mf.metafeature_values)
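The `multilabel_train_data` fixture lives elsewhere in the test suite (typically a conftest). A minimal sketch of what it could look like, assuming a synthetic multilabel problem with ten numerical features built via `sklearn.datasets.make_multilabel_classification` (the exact parameters are illustrative):

import pytest
from sklearn.datasets import make_multilabel_classification


@pytest.fixture
def multilabel_train_data():
    # Hypothetical fixture: 10 numerical features, so every entry in the
    # categorical dict above is False.
    X, y = make_multilabel_classification(
        n_samples=100, n_features=10, n_classes=5, random_state=1)
    return X, y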
Example #3
def test_calculate_all_metafeatures(self):
    mf = meta_features.calculate_all_metafeatures(
        self.X, self.y, self.categorical, "2")
    self.assertEqual(52, len(mf.metafeature_values))
    self.assertEqual(
        mf.metafeature_values['NumberOfCategoricalFeatures'].value, 32)
    sio = StringIO()
    mf.dump(sio)
Example #5
def test_calculate_all_metafeatures_multilabel(self):
    self.helpers.clear()
    X, y = self.get_multilabel()
    categorical = [False] * 10
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "Generated")
    self.assertEqual(52, len(mf.metafeature_values))
    sio = StringIO()
    mf.dump(sio)
Example #6
import warnings

from autosklearn.metalearning.metafeatures import metafeatures


def getFeatures(x, y, name):
    """
    x: numpy array, X of the dataset
    y: 1-D numpy array, Y of the dataset
    name: string, name of the dataset

    Returns: DatasetMetafeatures object
    (found in autosklearn/metalearning/metafeatures/metafeature.py)
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        metafeatures_labels = metafeatures.calculate_all_metafeatures(
            x, y, [False] * x.shape[1], name)
    return metafeatures_labels
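A minimal usage sketch for getFeatures; the synthetic dataset and its name are made up for illustration:

from sklearn.datasets import make_classification

# Build a small all-numerical dataset and compute its metafeatures.
X, y = make_classification(n_samples=200, n_features=8, random_state=0)
mf = getFeatures(X, y, "synthetic")

# The returned DatasetMetafeatures object exposes the computed values
# through its metafeature_values attribute, as in the tests above.
print(len(mf.metafeature_values))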
def test_calculate_all_metafeatures(sparse_data):
    X, y, categorical = sparse_data
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "2", logger=logging.getLogger('Meta'))
    assert 52 == len(mf.metafeature_values)
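The `sparse_data` fixture is not shown here; a rough sketch of what it might provide, assuming a SciPy CSR matrix and an all-numerical categorical dict (purely illustrative):

import numpy as np
import pytest
from scipy import sparse


@pytest.fixture
def sparse_data():
    # Hypothetical fixture: sparse design matrix, binary targets, and a
    # dict marking every feature as non-categorical.
    rng = np.random.RandomState(42)
    X = sparse.csr_matrix(rng.binomial(1, 0.1, size=(100, 10)).astype(float))
    y = rng.randint(0, 2, size=100)
    categorical = {i: False for i in range(X.shape[1])}
    return X, y, categorical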
def test_calculate_all_metafeatures_same_results_across_datatypes():
    """
    This test makes sure that numpy and pandas produce the same metafeatures.
    This also is an excuse to fully test anneal dataset, and make sure
    all metafeatures work in this complex dataset
    """
    X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True)
    categorical = {col: X[col].dtype.name == 'category'
                   for col in X.columns}
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "2", logger=logging.getLogger('Meta'))
    assert 52 == len(mf.metafeature_values)
    expected = {
        'PCASkewnessFirstPC': 0.41897660337677867,
        'PCAKurtosisFirstPC': -0.677692541156901,
        'PCAFractionOfComponentsFor95PercentVariance': 0.2716049382716049,
        'ClassEntropy': 1.1898338562043977,
        'SkewnessSTD': 7.540418815675546,
        'SkewnessMean': 1.47397188548894,
        'SkewnessMax': 29.916569235579203,
        'SkewnessMin': -29.916569235579203,
        'KurtosisSTD': 153.0563504598898,
        'KurtosisMean': 56.998860939761165,
        'KurtosisMax': 893.0011148272025,
        'KurtosisMin': -3.0,
        'SymbolsSum': 49,
        'SymbolsSTD': 1.3679553264445183,
        'SymbolsMean': 1.8846153846153846,
        'SymbolsMax': 7,
        'SymbolsMin': 1,
        'ClassProbabilitySTD': 0.28282850691819206,
        'ClassProbabilityMean': 0.2,
        'ClassProbabilityMax': 0.7616926503340757,
        'ClassProbabilityMin': 0.008908685968819599,
        'InverseDatasetRatio': 23.63157894736842,
        'DatasetRatio': 0.042316258351893093,
        'RatioNominalToNumerical': 5.333333333333333,
        'RatioNumericalToNominal': 0.1875,
        'NumberOfCategoricalFeatures': 32,
        'NumberOfNumericFeatures': 6,
        'NumberOfMissingValues': 22175.0,
        'NumberOfFeaturesWithMissingValues': 29.0,
        'NumberOfInstancesWithMissingValues': 898.0,
        'NumberOfFeatures': 38.0,
        'NumberOfClasses': 5.0,
        'NumberOfInstances': 898.0,
        'LogInverseDatasetRatio': 3.162583908575814,
        'LogDatasetRatio': -3.162583908575814,
        'PercentageOfMissingValues': 0.6498358926268901,
        'PercentageOfFeaturesWithMissingValues': 0.7631578947368421,
        'PercentageOfInstancesWithMissingValues': 1.0,
        'LogNumberOfFeatures': 3.6375861597263857,
        'LogNumberOfInstances': 6.8001700683022,
    }
    assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected)

    expected_landmarks = {
        'Landmark1NN': 0.9721601489757914,
        'LandmarkRandomNodeLearner': 0.7616945996275606,
        'LandmarkDecisionNodeLearner': 0.7827932960893855,
        'LandmarkDecisionTree': 0.9899875853507139,
        'LandmarkNaiveBayes': 0.9287150837988827,
        'LandmarkLDA': 0.9610242085661079,
    }
    assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx(
        expected_landmarks, rel=1e-5)

    # Then do numpy!
    X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=False)
    categorical = {i: bool(category)
                   for i, category in enumerate(categorical.values())}
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "2", logger=logging.getLogger('Meta'))
    assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected)

    # The column order of the pandas and numpy inputs differs after data
    # preprocessing, so the results cannot be compared directly; landmarking
    # is sensitive to column order.
    expected_landmarks['LandmarkDecisionTree'] = 0.9922098075729361
    assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx(
        expected_landmarks, rel=1e-5)
def test_calculate_all_metafeatures(meta_train_data):
    X, y, categorical = meta_train_data
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "2", logger=logging.getLogger('Meta'))
    assert 52 == len(mf.metafeature_values)
    assert mf.metafeature_values['NumberOfCategoricalFeatures'].value == 32
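The `meta_train_data` fixture is also defined elsewhere; given the dataset name "2" and the expected 32 categorical features, it presumably loads the OpenML anneal dataset, roughly along these lines (a sketch, not the actual fixture):

import pytest
from sklearn.datasets import fetch_openml


@pytest.fixture
def meta_train_data():
    # Hypothetical fixture: anneal (OpenML data_id=2) as a DataFrame, with a
    # per-column categorical indicator derived from the pandas dtypes.
    X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True)
    categorical = {col: X[col].dtype.name == 'category' for col in X.columns}
    return X, y, categorical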