def make_fixture(binary=False, balanced=False, split=False):
    """
    Build a synthetic classification dataset for the ClassBalance tests.

    Parameters
    ----------
    binary : bool, default: False
        When True two classes are requested, otherwise five.

    balanced : bool, default: False
        When True ``weights=None`` is handed to ``make_classification``;
        otherwise a fixed list of per-class weights is used.

    split : bool, default: False
        When True the data is partitioned with ``tts`` (``test_size=0.2``)
        and each half is wrapped in a ``Split``.

    Returns
    -------
    Dataset
        ``Dataset(X, y)``, or
        ``Dataset(Split(X_train, X_test), Split(y_train, y_test))`` if
        ``split`` is True.
    """
    # Pick the class count together with the weights used when unbalanced
    if binary:
        n_classes, imbalance = 2, [0.3, 0.7]
    else:
        n_classes, imbalance = 5, [0.1, 0.2, 0.4, 0.2, 0.01]

    X, y = make_classification(
        n_samples=100,
        n_features=20,
        n_informative=8,
        n_redundant=2,
        n_clusters_per_class=1,
        random_state=89092,
        n_classes=n_classes,
        weights=None if balanced else imbalance,
    )

    if not split:
        return Dataset(X, y)

    train_X, test_X, train_y, test_y = tts(X, y, test_size=0.2, random_state=101)
    return Dataset(Split(train_X, test_X), Split(train_y, test_y))
def s_curves(request):
    """
    Creates an S-curve dataset fixture from ``make_s_curve`` called with
    1000 and a fixed random state, stored on the requesting test class.
    """
    points, target = make_s_curve(1000, random_state=888)

    # Expose the generated data to the test class as ``s_curves``
    curves = Dataset(points, target)
    request.cls.s_curves = curves
def clusters(request):
    """
    Creates a small hard-coded fixture of 18 instances with 4 features each,
    labelled 0 or 1, and stores it on the requesting test class.
    """
    # TODO: replace with make_blobs
    rows = [
        [-0.40020753, -4.67055317, -0.27191127, -1.49156318],
        [0.37143349, -4.89391622, -1.23893945, 0.48318165],
        [8.625142, -1.2372284, 1.39301471, 4.3394457],
        [7.65803596, -2.21017215, 1.99175714, 3.71004654],
        [0.89319875, -5.37152317, 1.50313598, 1.95284886],
        [2.68362166, -5.78810913, -0.41233406, 1.94638989],
        [7.63541182, -1.99606076, 0.9241231, 4.53478238],
        [9.04699415, -0.74540679, 0.98042851, 5.99569071],
        [1.02552122, -5.73874278, -1.74804915, -0.07831216],
        [7.18135665, -3.49473178, 1.14300963, 4.46065816],
        [0.58812902, -4.66559815, -0.72831685, 1.40171779],
        [1.48620862, -5.9963108, 0.19145963, -1.11369256],
        [7.6625556, -1.21328083, 2.06361094, 6.2643551],
        [9.45050727, -1.36536078, 1.31154384, 3.89103468],
        [6.88203724, -1.62040255, 3.89961049, 2.12865388],
        [5.60842705, -2.10693356, 1.93328514, 3.90825432],
        [2.35150936, -6.62836131, -1.84278374, 0.51540886],
        [1.17446451, -5.62506058, -2.18420699, 1.21385128],
    ]
    memberships = [0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0]

    points = np.array(rows)
    labels = np.array(memberships)

    # Expose the data to the test class as ``clusters``
    request.cls.clusters = Dataset(points, labels)
Exemple #4
0
def blobs4(request):
    """
    Creates a blobs fixture requesting 400 samples, 4 centers and 16
    features, and stores it on the requesting test class.
    """
    blob_params = {
        "centers": 4,
        "n_samples": 400,
        "n_features": 16,
        "shuffle": True,
        "random_state": 1212,
    }
    points, labels = make_blobs(**blob_params)

    # Expose the data to the test class as ``blobs4``
    request.cls.blobs4 = Dataset(points, labels)
Exemple #5
0
def blobs12(request):
    """
    Creates a blobs fixture requesting 1000 samples, 12 centers and 16
    features, and stores it on the requesting test class.
    """
    blob_params = {
        "centers": 12,
        "n_samples": 1000,
        "n_features": 16,
        "shuffle": True,
        "random_state": 2121,
    }
    points, labels = make_blobs(**blob_params)

    # Expose the data to the test class as ``blobs12``
    request.cls.blobs12 = Dataset(points, labels)
Exemple #6
0
def clusters(request):
    """
    Creates a blobs clustering dataset fixture requesting 500 samples,
    20 features and 3 centers, and stores it on the requesting test class.
    """
    points, labels = make_blobs(
        n_samples=500,
        n_features=20,
        centers=3,
        random_state=743,
    )

    # Expose the data to the test class as ``clusters``
    request.cls.clusters = Dataset(points, labels)
def continuous(request):
    """
    Creates a regression dataset fixture requesting 500 samples and 22
    features (8 informative), and stores it on the requesting test class.
    """
    regression_params = {
        "n_samples": 500,
        "n_features": 22,
        "n_informative": 8,
        "random_state": 2019,
    }
    features, target = make_regression(**regression_params)

    # Expose the continuous data to the test class as ``continuous``
    request.cls.continuous = Dataset(features, target)
Exemple #8
0
def blobs(request):
    """
    Creates a blobs clustering dataset fixture requesting 1000 samples,
    12 features and 6 centers, and stores it on the requesting test class.
    """
    blob_params = {
        "n_samples": 1000,
        "n_features": 12,
        "centers": 6,
        "shuffle": True,
        "random_state": 42,
    }
    points, labels = make_blobs(**blob_params)

    # Expose the data to the test class as ``blobs``
    request.cls.blobs = Dataset(points, labels)
def continuous(request):
    """
    Creates a regression dataset fixture requesting 500 samples and 22
    features (8 informative), and stores it on the requesting test class.
    """
    features, target = make_regression(
        n_samples=500,
        n_features=22,
        n_informative=8,
        random_state=2019,
    )

    # Stored on the class so tests read it as ``self.continuous``
    generated = Dataset(features, target)
    request.cls.continuous = generated
Exemple #10
0
def digits(request):
    """
    Creates a train/test split fixture from the data returned by
    ``load_digits``. The result is a Dataset composed of two Split tuples,
    one for the features and one for the targets, with a test size of 0.2.
    """
    bunch = load_digits()
    train_X, test_X, train_y, test_y = tts(
        bunch.data,
        bunch.target,
        test_size=0.2,
        random_state=11,
    )

    features = Split(train_X, test_X)
    targets = Split(train_y, test_y)

    # Expose the split data to the test class as ``digits``
    request.cls.digits = Dataset(features, targets)
def discrete(request):
    """
    Creates a classification dataset fixture requesting 5 classes, 400
    samples and 12 features (10 informative, none redundant), and stores it
    on the requesting test class.
    """
    classification_params = {
        "n_classes": 5,
        "n_samples": 400,
        "n_features": 12,
        "n_informative": 10,
        "n_redundant": 0,
        "random_state": 2019,
    }
    features, labels = make_classification(**classification_params)

    # Stored on the class so tests read it as ``self.discrete``
    request.cls.discrete = Dataset(features, labels)
Exemple #12
0
def regression(request):
    """
    Creates a regression dataset fixture requesting 500 samples and 20
    features (8 informative) with a small noise term and a bias of 1.4,
    and stores it on the requesting test class.
    """
    regression_params = {
        "n_samples": 500,
        "n_features": 20,
        "n_informative": 8,
        "noise": 0.01,
        "bias": 1.4,
        "random_state": 953,
    }
    features, target = make_regression(**regression_params)

    # Expose the data to the test class as ``regression``
    request.cls.regression = Dataset(features, target)
def data(request):
    """
    Creates a noisy regression fixture (100 samples, 14 features, 6
    informative) intended to have an R2 score below 0.85 and several
    outliers, to best demonstrate the effectiveness of influence visualizers.
    """
    regression_params = {
        "n_samples": 100,
        "n_features": 14,
        "n_informative": 6,
        "bias": 1.2,
        "noise": 49.8,
        "tail_strength": 0.6,
        "random_state": 637,
    }
    features, target = make_regression(**regression_params)

    # Expose the data to the test class as ``data``
    request.cls.data = Dataset(features, target)
Exemple #14
0
def classification(request):
    """
    Creates a multiclass classification dataset fixture requesting 3
    classes, 500 samples and 20 features, and stores it on the requesting
    test class.
    """
    classification_params = {
        "n_samples": 500,
        "n_features": 20,
        "n_informative": 8,
        "n_redundant": 2,
        "n_classes": 3,
        "n_clusters_per_class": 3,
        "random_state": 3902,
    }
    features, labels = make_classification(**classification_params)

    # Expose the data to the test class as ``classification``
    request.cls.classification = Dataset(features, labels)
Exemple #15
0
def dataset(request):
    """
    Creates a multiclass classification dataset fixture for RFECV
    requesting 4 classes, 300 samples and 5 features (3 informative), and
    stores it on the requesting test class.
    """
    classification_params = {
        "n_samples": 300,
        "n_features": 5,
        "n_informative": 3,
        "n_repeated": 0,
        "n_classes": 4,
        "n_clusters_per_class": 1,
        "random_state": 0,
    }
    features, labels = make_classification(**classification_params)

    # Expose the data to the test class as ``dataset``
    request.cls.dataset = Dataset(features, labels)
Exemple #16
0
def continuous(request):
    """
    Creates a continuous regression dataset fixture (500 samples, 22
    features, 8 informative) partitioned into train and test splits with a
    test size of 0.2, and stores it on the requesting test class.
    """
    regression_params = {
        "n_samples": 500,
        "n_features": 22,
        "n_informative": 8,
        "random_state": 42,
        "noise": 0.2,
        "bias": 0.2,
    }
    features, target = make_regression(**regression_params)

    train_X, test_X, train_y, test_y = tts(
        features, target, test_size=0.2, random_state=11
    )

    # Expose the split regression data to the test class as ``continuous``
    request.cls.continuous = Dataset(
        Split(train_X, test_X),
        Split(train_y, test_y),
    )
Exemple #17
0
def multiclass(request):
    """
    Creates a multiclass classification dataset fixture (6 classes, 500
    samples, 20 features) partitioned into train and test splits with a test
    size of 0.2, and stores it on the requesting test class.
    """
    classification_params = {
        "n_samples": 500,
        "n_features": 20,
        "n_informative": 8,
        "n_redundant": 2,
        "n_classes": 6,
        "n_clusters_per_class": 3,
        "random_state": 87,
    }
    features, labels = make_classification(**classification_params)

    train_X, test_X, train_y, test_y = tts(
        features, labels, test_size=0.2, random_state=93
    )

    # Expose the split data to the test class as ``multiclass``
    request.cls.multiclass = Dataset(
        Split(train_X, test_X),
        Split(train_y, test_y),
    )
def discrete(request):
    """
    Creates a classification dataset fixture requesting 5 classes, 400
    samples and 12 features, where each feature is given its own entry in
    the ``scale`` list, and stores it on the requesting test class.
    """
    # One scale value per requested feature (12 in total)
    feature_scales = [
        14.2, 2.1, 0.32, 0.001, 32.3, 44.1, 102.3, 2.3, 2.4, 38.2, 0.05, 1.0
    ]

    features, labels = make_classification(
        n_samples=400,
        n_features=12,
        n_informative=8,
        n_redundant=0,
        n_classes=5,
        n_clusters_per_class=1,
        class_sep=1.8,
        random_state=854,
        scale=feature_scales,
    )

    # Expose the discrete data to the test class as ``discrete``
    request.cls.discrete = Dataset(features, labels)
def data(request):
    """
    Creates a regression dataset fixture (500 samples, 22 features, 8
    informative) partitioned into train and test splits with a test size of
    0.2. The result is a Dataset composed of two Split tuples and is stored
    on the requesting test class.
    """
    regression_params = {
        "n_samples": 500,
        "n_features": 22,
        "n_informative": 8,
        "random_state": 42,
        "noise": 0.2,
        "bias": 0.2,
    }
    features, target = make_regression(**regression_params)

    train_X, test_X, train_y, test_y = tts(
        features, target, test_size=0.2, random_state=11
    )

    # Expose the split regression data to the test class as ``data``
    request.cls.data = Dataset(
        Split(train_X, test_X),
        Split(train_y, test_y),
    )