Example #1
import pandas as pd

from skoot.datasets import load_iris_df


def test_load_iris():
    iris = load_iris_df(include_tgt=False, names=['a', 'b', 'c', 'd'])
    assert isinstance(iris, pd.DataFrame)
    assert 'species' not in iris.columns
    assert iris.shape == (150, 4)

    # assert on the names
    assert all(n in iris.columns for n in ('a', 'b', 'c', 'd'))
Example #2
# Author: Taylor Smith <*****@*****.**>

from __future__ import print_function

import numpy as np
import pandas as pd

from skoot.datasets import load_iris_df
from skoot.utils.testing import assert_raises
from skoot.feature_selection import (FeatureFilter, SparseFeatureFilter,
                                     MultiCorrFilter, NearZeroVarianceFilter)

from numpy.testing import assert_array_equal, assert_array_almost_equal

# get some datasets defined for use later
iris = load_iris_df(include_tgt=False, names=['a', 'b', 'c', 'd'])

sparse = pd.DataFrame.from_records(data=[[1., 2., np.nan], [2., 3., np.nan],
                                         [np.nan, 4., 5.]],
                                   columns=['a', 'b', 'c'])


def test_nzv_constant_col():
    X = pd.DataFrame.from_records(data=np.array([[1, 2, 3], [4, 5, 3],
                                                 [6, 7, 3], [8, 9, 3]]),
                                  columns=['a', 'b', 'c'])

    flt = NearZeroVarianceFilter(freq_cut=25)
    trans = flt.fit_transform(X)

    # show that the output is one column shorter: the constant column
    # 'c' is dropped
    assert trans.shape[1] == X.shape[1] - 1
    assert 'c' not in trans.columns
Example #3
# -*- coding: utf-8 -*-

from __future__ import absolute_import

from skoot.exploration.multivariate import summarize
from skoot.datasets import load_iris_df

import numpy as np
from numpy.testing import assert_array_almost_equal, assert_array_equal

# used throughout
nan = np.nan
float_fields = ["a", "b", "c", "d"]

# load iris and add a string field
iris = load_iris_df(include_tgt=True, names=float_fields,
                    tgt_name="species")
iris["cls"] = ["A" if x == 0 else "B" if x == 1 else "C"
               for x in iris["species"]]


def test_summarize_all_continuous():
    cont = iris[float_fields]
    summary = summarize(cont)
    # show we get the stats we expect
    expected = [
        [5.843333, 3.054000, 3.758667, 1.198667],    # mean
        [5.800000, 3.000000, 4.350000, 1.300000],    # median
        [7.900000, 4.400000, 6.900000, 2.500000],    # max
        [4.300000, 2.000000, 1.000000, 0.100000],    # min
        [0.685694, 0.188004, 3.113179, 0.582414],    # variance
        [0.311753, 0.330703, -0.271712, -0.103944],  # skewness
    ]

    # compare against the computed summary (hedged completion: the source
    # snippet truncates here, and the exact row labels used by
    # ``summarize`` are assumptions)
    rows = ["mean", "median", "max", "min", "variance", "skewness"]
    assert_array_almost_equal(summary.loc[rows].values.astype(float), expected)
Example #4
# -*- coding: utf-8 -*-

from __future__ import absolute_import

from skoot.datasets import load_iris_df
from skoot.preprocessing.schema import SchemaNormalizer
from skoot.utils.testing import assert_persistable

X = load_iris_df()
schema = {'petal width (cm)': int}


def test_normalizer():
    norm = SchemaNormalizer(schema).fit(X)
    trans = norm.transform(X)
    types = trans.dtypes

    assert types['petal width (cm)'].name.startswith("int"), types


def test_schema_persistable():
    assert_persistable(SchemaNormalizer(schema), "location.pkl", X)
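
# For intuition, the schema coercion above is roughly equivalent to this
# plain-pandas sketch (an illustration only, not SchemaNormalizer's
# actual implementation):
def coerce_schema(df, schema):
    out = df.copy()
    for col, dtype in schema.items():
        out[col] = out[col].astype(dtype)
    return out

assert coerce_schema(X, schema).dtypes['petal width (cm)'].name.startswith("int")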
Example #5
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <*****@*****.**>

from __future__ import absolute_import

from sklearn.preprocessing import RobustScaler

from skoot.preprocessing import SelectiveScaler
from skoot.datasets import load_iris_df

from numpy.testing import assert_array_almost_equal
import numpy as np

X = load_iris_df(include_tgt=False)


def test_selective_scale():
    original = X
    cols = [original.columns[0]]  # Only perform on first...

    # original_means = np.mean(X, axis=0)
    #  array([ 5.84333333,  3.054     ,  3.75866667,  1.19866667])

    # original_std = np.std(X, axis=0)
    #  array([ 0.82530129,  0.43214658,  1.75852918,  0.76061262])

    transformer = SelectiveScaler(cols=cols).fit(original)
    transformed = transformer.transform(original)

    # expected: array([ 0.  ,  3.054     ,  3.75866667,  1.19866667])
    # (completion of the truncated snippet: only the scaled column's
    # mean should change, landing at ~0)
    new_means = np.mean(transformed, axis=0)
    assert_array_almost_equal(new_means, [0., 3.054, 3.75866667, 1.19866667])
Example #6
"""
=================
Example summarize
=================

Demonstrates how to use the ``summarize`` function to get a quick
summary of your dataset.

.. raw:: html

   <br/>
"""
print(__doc__)

# Author: Taylor Smith <*****@*****.**>

from skoot.exploration import summarize
from skoot.datasets import load_iris_df

# #############################################################################
# load data
iris = load_iris_df(include_tgt=True)

# add a string feature with only a single level. This demonstrates that
# the summary will report even on uninformative features
iris["x5"] = "Level1"

# print the summary of the dataset
print(summarize(iris))
Example #7
# -*- coding: utf-8 -*-

from __future__ import absolute_import

from skoot.preprocessing import BinningTransformer
from skoot.datasets import load_iris_df
from skoot.utils.testing import assert_raises

import numpy as np
from numpy.testing import assert_array_equal

iris = load_iris_df(include_tgt=False, names=["a", "b", "c", "d"])


def test_binning_simple():
    binner = BinningTransformer(cols=["a"],
                                n_bins=3,
                                strategy="uniform",
                                return_bin_label=True,
                                overwrite=True)
    binner.fit(iris)
    trans = binner.transform(iris)

    # show the dfs are not the same
    assert trans is not iris

    # show the columns stayed the same, though
    assert trans.columns.tolist() == iris.columns.tolist()

    # show we have a string datatype now
    assert trans.dtypes['a'].name == 'object'
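
# For intuition, "uniform" binning of column "a" into 3 bins is roughly
# equivalent to this numpy sketch (an illustration only, not
# BinningTransformer's actual implementation):
edges = np.linspace(iris['a'].min(), iris['a'].max(), 3 + 1)
bin_ids = np.digitize(iris['a'], edges[1:-1])  # bin index 0, 1 or 2 per row
assert set(bin_ids) <= {0, 1, 2}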
Example #8
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <*****@*****.**>

from __future__ import absolute_import

from skoot.utils.dataframe import get_numeric_columns
from skoot.datasets import load_iris_df

# get iris loaded
iris = load_iris_df(names=['a', 'b', 'c', 'd'], tgt_name='e')


def test_get_numeric():
    subset = get_numeric_columns(iris)
    assert subset.equals(iris)
    assert subset is not iris


def test_get_numeric_subset():
    df = iris.copy()
    df['e'] = df['e'].astype(str)
    subset = get_numeric_columns(df)
    assert subset.shape != df.shape
Example #9
#
# Author: Taylor Smith <*****@*****.**>

from __future__ import print_function, absolute_import, division

from numpy.testing import assert_array_almost_equal

from skoot.datasets import load_iris_df
from skoot.utils.testing import assert_transformer_asdf, assert_persistable
from skoot.decomposition import (SelectivePCA, SelectiveTruncatedSVD,
                                 SelectiveNMF, SelectiveKernelPCA,
                                 SelectiveIncrementalPCA)

# Define data for testing
names = ['a', 'b', 'c', 'd']
X = load_iris_df(include_tgt=False, names=names)


def test_selective_pca():
    # create a copy of the original
    original = X.copy()

    # set the columns we'll fit to just be the first
    cols = [names[0]]  # 'a'

    # the "other" names, and their corresponding matrix
    comp_column_names = names[1:]
    compare_cols = original[comp_column_names].values

    # now fit PCA on the first column only (the remaining constructor
    # arguments are truncated in the source, so the defaults are used)
    transformer = SelectivePCA(cols=cols).fit(original)

    # the non-fitted columns should pass through untouched (assumes the
    # Selective* transformers leave unlisted columns intact)
    trans = transformer.transform(original)
    assert_array_almost_equal(trans[comp_column_names].values, compare_cols)
Example #10
   <br/>
"""
print(__doc__)

# Author: Taylor Smith <*****@*****.**>

# #############################################################################
# Introduce an interesting scenario
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from skoot.preprocessing import SelectiveStandardScaler
from skoot.base import make_transformer
from skoot.datasets import load_iris_df

X = load_iris_df(tgt_name="target")
y = X.pop('target')
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.2)


# Let's say we want to scale our features with the StandardScaler, but
# for whatever reason we only want the ABSOLUTE value of the scaled values.
# We *could* create a transformer or split our pipeline, but either
# approach is clunky and could interrupt our CV process in a grid search.
#
# So we'll instead define a simple function that will be wrapped in an
# "anonymous" transformer
def make_abs(X):
    # return the absolute value of every cell (completion of the
    # truncated snippet; the intent follows from the comments above)
    return X.abs()
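
# A hedged sketch of how the pieces above might be combined in a single
# pipeline. The exact ``make_transformer`` call signature is an
# assumption, not confirmed by this snippet:
pipe = Pipeline([
    ("scale", SelectiveStandardScaler()),
    ("abs", make_transformer(make_abs)),
])
pipe.fit(X_train, y_train)
print(pipe.transform(X_test).head())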
Example #11
between scikit-learn and skoot.

.. raw:: html

   <br/>
"""
print(__doc__)

# Author: Taylor Smith <*****@*****.**>

# #############################################################################
# Skoot is laid out much like scikit-learn. That is, many of the same modules
# exist in skoot that are present in scikit. For example:
from skoot import decomposition
print(dir(decomposition))  # many are similar to sklearn classes
print("")

# #############################################################################
# Skoot also has a dataset interface, like sklearn. Except it returns
# dataframes rather than numpy arrays:
from skoot.datasets import load_iris_df
df = load_iris_df(include_tgt=True, tgt_name='Species')
print(df.head())
print("")

# #############################################################################
# All skoot transformers are based on the BasePDTransformer:
from skoot.base import BasePDTransformer

print(BasePDTransformer.__doc__)
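
# A minimal sketch of a custom transformer built on BasePDTransformer.
# The cols/as_df constructor arguments are assumptions drawn from the
# docstring printed above; verify them against the real base class:
class NoOpTransformer(BasePDTransformer):
    def __init__(self, cols=None, as_df=True):
        super(NoOpTransformer, self).__init__(cols=cols, as_df=as_df)

    def fit(self, X, y=None):
        return self  # nothing to learn

    def transform(self, X):
        return X.copy()  # pass the dataframe through untouched

print(NoOpTransformer().fit(df).transform(df).head())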