def make_fixture(binary=False, balanced=False, split=False):
    """
    Build a synthetic classification dataset for testing ClassBalance.

    Parameters
    ----------
    binary : bool, default: False
        Generate 2 classes when True, otherwise 5 classes.

    balanced : bool, default: False
        When True no class weights are passed (``weights=None``); otherwise
        a fixed, uneven weight list matching the number of classes is used.

    split : bool, default: False
        When True the data is divided with ``tts`` (``test_size=0.2``) and
        the result is wrapped in ``Split`` tuples.

    Returns
    -------
    Dataset
        ``Dataset(X, y)``, or when ``split`` is set
        ``Dataset(Split(X_train, X_test), Split(y_train, y_test))``.
    """
    # Pick the class count together with the weights used when unbalanced
    if binary:
        n_classes, skewed = 2, [0.3, 0.7]
    else:
        n_classes, skewed = 5, [0.1, 0.2, 0.4, 0.2, 0.01]

    X, y = make_classification(
        n_samples=100,
        n_features=20,
        n_informative=8,
        n_redundant=2,
        n_clusters_per_class=1,
        random_state=89092,
        n_classes=n_classes,
        weights=None if balanced else skewed,
    )

    if not split:
        return Dataset(X, y)

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=101)
    return Dataset(Split(X_train, X_test), Split(y_train, y_test))
def s_curves(request):
    """
    Creates an S-curve dataset fixture using ``make_s_curve``.

    The data is generated with a fixed random state and attached to the
    requesting test class as ``s_curves``.
    """
    points, position = make_s_curve(1000, random_state=888)

    # Expose the dataset as an attribute of the test class
    request.cls.s_curves = Dataset(points, position)
def clusters(request):
    """
    Creates a small hard-coded clustering fixture.

    The dataset has 18 instances with 4 features each and a 0/1 label per
    instance; it is attached to the requesting test class as ``clusters``.
    """
    # TODO: replace with make_blobs
    points = np.array(
        [
            [-0.40020753, -4.67055317, -0.27191127, -1.49156318],
            [0.37143349, -4.89391622, -1.23893945, 0.48318165],
            [8.625142, -1.2372284, 1.39301471, 4.3394457],
            [7.65803596, -2.21017215, 1.99175714, 3.71004654],
            [0.89319875, -5.37152317, 1.50313598, 1.95284886],
            [2.68362166, -5.78810913, -0.41233406, 1.94638989],
            [7.63541182, -1.99606076, 0.9241231, 4.53478238],
            [9.04699415, -0.74540679, 0.98042851, 5.99569071],
            [1.02552122, -5.73874278, -1.74804915, -0.07831216],
            [7.18135665, -3.49473178, 1.14300963, 4.46065816],
            [0.58812902, -4.66559815, -0.72831685, 1.40171779],
            [1.48620862, -5.9963108, 0.19145963, -1.11369256],
            [7.6625556, -1.21328083, 2.06361094, 6.2643551],
            [9.45050727, -1.36536078, 1.31154384, 3.89103468],
            [6.88203724, -1.62040255, 3.89961049, 2.12865388],
            [5.60842705, -2.10693356, 1.93328514, 3.90825432],
            [2.35150936, -6.62836131, -1.84278374, 0.51540886],
            [1.17446451, -5.62506058, -2.18420699, 1.21385128],
        ]
    )
    labels = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0])

    # Expose the dataset as an attribute of the test class
    request.cls.clusters = Dataset(points, labels)
def blobs4(request):
    """
    Creates a blobs fixture: 400 instances, 4 centers, 16 features.

    The dataset is attached to the requesting test class as ``blobs4``.
    """
    options = {
        "n_samples": 400,
        "n_features": 16,
        "centers": 4,
        "shuffle": True,
        "random_state": 1212,
    }
    points, labels = make_blobs(**options)
    request.cls.blobs4 = Dataset(points, labels)
def blobs12(request):
    """
    Creates a blobs fixture: 1000 instances, 12 centers, 16 features.

    The dataset is attached to the requesting test class as ``blobs12``.
    """
    points, labels = make_blobs(
        n_samples=1000,
        n_features=16,
        centers=12,
        shuffle=True,
        random_state=2121,
    )
    fixture = Dataset(points, labels)
    request.cls.blobs12 = fixture
def clusters(request):
    """
    Creates a random blobs clustering dataset fixture.

    Generated with ``make_blobs`` (500 samples, 20 features, 3 centers) and
    attached to the requesting test class as ``clusters``.
    """
    points, labels = make_blobs(
        centers=3, n_features=20, n_samples=500, random_state=743
    )

    # Expose the dataset as an attribute of the test class
    request.cls.clusters = Dataset(points, labels)
def continuous(request):
    """
    Creates a random regression dataset fixture.

    Generated with ``make_regression`` (500 samples, 22 features of which 8
    are informative) and attached to the test class as ``continuous``.
    """
    options = {
        "n_samples": 500,
        "n_features": 22,
        "n_informative": 8,
        "random_state": 2019,
    }
    features, target = make_regression(**options)

    # Expose the continuous data as an attribute of the test class
    request.cls.continuous = Dataset(features, target)
def blobs(request):
    """
    Creates a random blobs clustering dataset fixture.

    Generated with ``make_blobs`` (1000 samples, 12 features, 6 centers) and
    attached to the requesting test class as ``blobs``.
    """
    points, labels = make_blobs(
        centers=6,
        n_features=12,
        n_samples=1000,
        random_state=42,
        shuffle=True,
    )
    fixture = Dataset(points, labels)

    # Expose the blobs as an attribute of the test class
    request.cls.blobs = fixture
def continuous(request):
    """
    Creates a random regression dataset fixture.

    Generated with ``make_regression`` using a fixed random state and
    attached to the requesting test class as ``continuous``.
    """
    features, target = make_regression(
        n_samples=500,
        n_features=22,
        n_informative=8,
        random_state=2019,
    )
    fixture = Dataset(features, target)

    # Stored on the class so that tests can read it as an attribute
    request.cls.continuous = fixture
def digits(request):
    """
    Creates a train/test split fixture from the sklearn digits dataset.

    The fixture is attached to the requesting test class as ``digits``: a
    Dataset composed of two Split tuples, the first built from the train and
    test features and the second from the train and test targets.
    """
    bunch = load_digits()
    train_X, test_X, train_y, test_y = tts(
        bunch.data, bunch.target, test_size=0.2, random_state=11
    )

    features = Split(train_X, test_X)
    targets = Split(train_y, test_y)

    # Expose the digits data as an attribute of the test class
    request.cls.digits = Dataset(features, targets)
def discrete(request):
    """
    Creates a random classification dataset fixture.

    Generated with ``make_classification`` (5 classes, 400 samples, 12
    features) and attached to the requesting test class as ``discrete``.
    """
    options = {
        "n_samples": 400,
        "n_classes": 5,
        "n_features": 12,
        "n_informative": 10,
        "n_redundant": 0,
        "random_state": 2019,
    }
    features, labels = make_classification(**options)

    # Stored on the class so that tests can read it as an attribute
    request.cls.discrete = Dataset(features, labels)
def regression(request):
    """
    Creates a random regression dataset fixture.

    Generated with ``make_regression`` (500 samples, 20 features, small noise
    and a bias term) and attached to the test class as ``regression``.
    """
    options = {
        "n_samples": 500,
        "n_features": 20,
        "n_informative": 8,
        "noise": 0.01,
        "bias": 1.4,
        "random_state": 953,
    }
    features, target = make_regression(**options)
    request.cls.regression = Dataset(features, target)
def data(request):
    """
    Creates a noisy random regression fixture for influence visualizers.

    The parameters are chosen with the intent of producing a dataset that
    has an R2 score below 0.85 and several outliers, to best demonstrate the
    effectiveness of influence visualizers. The dataset is attached to the
    requesting test class as ``data``.
    """
    features, target = make_regression(
        n_samples=100,
        n_features=14,
        n_informative=6,
        tail_strength=0.6,
        noise=49.8,
        bias=1.2,
        random_state=637,
    )
    fixture = Dataset(features, target)
    request.cls.data = fixture
def classification(request):
    """
    Creates a random multiclass classification dataset fixture.

    Generated with ``make_classification`` (3 classes, 3 clusters per class)
    and attached to the requesting test class as ``classification``.
    """
    options = {
        "n_samples": 500,
        "n_features": 20,
        "n_informative": 8,
        "n_redundant": 2,
        "n_classes": 3,
        "n_clusters_per_class": 3,
        "random_state": 3902,
    }
    features, labels = make_classification(**options)
    request.cls.classification = Dataset(features, labels)
def dataset(request):
    """
    Creates a multiclass classification dataset fixture for RFECV.

    Generated with ``make_classification`` (300 samples, 5 features, 4
    classes) and attached to the requesting test class as ``dataset``.
    """
    features, labels = make_classification(
        n_classes=4,
        n_clusters_per_class=1,
        n_samples=300,
        n_features=5,
        n_informative=3,
        n_repeated=0,
        random_state=0,
    )

    # Expose the dataset as an attribute of the test class
    request.cls.dataset = Dataset(features, labels)
def continuous(request):
    """
    Creates a random continuous regression dataset fixture with a split.

    The data comes from ``make_regression`` and is divided with ``tts``
    (``test_size=0.2``). It is attached to the requesting test class as
    ``continuous``, a Dataset composed of two Split tuples.
    """
    options = {
        "n_samples": 500,
        "n_features": 22,
        "n_informative": 8,
        "noise": 0.2,
        "bias": 0.2,
        "random_state": 42,
    }
    features, target = make_regression(**options)
    train_X, test_X, train_y, test_y = tts(
        features, target, test_size=0.2, random_state=11
    )

    # Expose the regression data as an attribute of the test class
    request.cls.continuous = Dataset(Split(train_X, test_X), Split(train_y, test_y))
def multiclass(request):
    """
    Creates a random multiclass classification dataset fixture with a split.

    The data comes from ``make_classification`` (6 classes) and is divided
    with ``tts`` (``test_size=0.2``). It is attached to the requesting test
    class as ``multiclass``, a Dataset composed of two Split tuples.
    """
    features, labels = make_classification(
        n_classes=6,
        n_clusters_per_class=3,
        n_samples=500,
        n_features=20,
        n_informative=8,
        n_redundant=2,
        random_state=87,
    )
    train_X, test_X, train_y, test_y = tts(
        features, labels, test_size=0.2, random_state=93
    )

    feature_split = Split(train_X, test_X)
    label_split = Split(train_y, test_y)
    request.cls.multiclass = Dataset(feature_split, label_split)
def discrete(request):
    """
    Creates a random classification dataset fixture with scaled features.

    Generated with ``make_classification`` (400 samples, 12 features, 5
    classes) where a separate ``scale`` value is supplied for each of the
    12 features. The unsplit dataset is attached to the requesting test
    class as ``discrete``.
    """
    # One scale value per feature
    feature_scales = [
        14.2, 2.1, 0.32, 0.001, 32.3, 44.1, 102.3, 2.3, 2.4, 38.2, 0.05, 1.0,
    ]

    features, labels = make_classification(
        n_samples=400,
        n_features=12,
        n_informative=8,
        n_redundant=0,
        n_classes=5,
        n_clusters_per_class=1,
        class_sep=1.8,
        random_state=854,
        scale=feature_scales,
    )

    # Expose the discrete data as an attribute of the test class
    request.cls.discrete = Dataset(features, labels)
def data(request):
    """
    Creates a fixture of train and test splits for a random regression
    dataset.

    The data comes from ``make_regression`` and is divided with ``tts``
    (``test_size=0.2``). It is attached to the requesting test class as
    ``data``, a Dataset composed of two Split tuples.
    """
    features, target = make_regression(
        n_samples=500,
        n_features=22,
        n_informative=8,
        noise=0.2,
        bias=0.2,
        random_state=42,
    )
    train_X, test_X, train_y, test_y = tts(
        features, target, test_size=0.2, random_state=11
    )

    feature_split = Split(train_X, test_X)
    target_split = Split(train_y, test_y)

    # Expose the regression data as an attribute of the test class
    request.cls.data = Dataset(feature_split, target_split)