Ejemplo n.º 1
0
    def test_filter(self):
        """Check that Filter drops rows with a missing value in any column.

        A four-row frame with two NaN-containing rows is written to a
        temporary CSV (NaN encoded as '?'), read back through a
        FileDataStream with an explicit schema, and filtered on all four
        columns; two complete rows are expected to remain.
        """
        columns = [
            'Petal_Length', 'Petal_Width', 'Sepal_Length', 'Sepal_Width'
        ]
        with_nans = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                      Petal_Width=[.8, .7, .9, 0.7]))

        tmpfile = 'tmpfile_with_nans.csv'
        try:
            # Pin the column order in the file so that it matches the
            # positional indices of the schema below regardless of how the
            # installed pandas orders dict-built columns.
            with_nans.to_csv(tmpfile, index=False, na_rep='?',
                             columns=columns)

            file_schema = 'sep=, col=Petal_Length:R4:0 ' \
                          'col=Petal_Width:R4:1 ' \
                          'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 header+'
            data = FileDataStream(tmpfile, schema=file_schema)

            xf = Filter(columns=columns)

            features = xf.fit_transform(data)

            assert features.shape == (2, 4)
            print(features.columns)
            # columns ordering changed between 0.22 and 0.23
            assert set(features.columns) == set(columns)
        finally:
            # Always clean up, even when an assertion above fails.
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
    def test_inf(self):
        """Check that infinite values survive filtering on column f0.

        Both the +inf in f0 (row 0) and the -inf in f1 (row 2) must still be
        present after the transform.
        """
        # np.inf is used instead of the np.Infinity alias, which was removed
        # in NumPy 2.0.
        data = DataFrame(data=dict(f0=[np.inf, 1, 2, 3, 4, 5, 6],
                                   f1=[1, 2, -np.inf, 3, 4, 5, 6]))

        xf = Filter(columns=['f0'])
        filtered = xf.fit_transform(data)
        assert_equal(filtered['f0'][0], np.inf)
        assert_equal(filtered['f1'][2], -np.inf)
    def test_missing(self):
        """Filtering on one column keeps exactly its non-missing rows.

        For each column in turn, the number of rows returned by Filter is
        compared with the number of entries in that column that are either
        strings or non-NaN numbers.
        """
        frame_columns = dict(
            f0=[np.nan, 1, 2, 3, 4, 5, 6],
            f1=[1, 2, np.nan, 3, 4, 5, 6],
            f2=[np.nan, 1, np.nan, 2, 3, np.nan, 4],
        )
        data = DataFrame(data=frame_columns)

        for col in data.columns:
            filtered = Filter(columns=[col]).fit_transform(data)
            # Strings are never considered missing; numbers are kept unless
            # they are NaN.
            expected_rows = sum(
                1 for value in data[col]
                if isinstance(value, str) or not isnan(value)
            )
            assert_equal(filtered.shape[0], expected_rows)
Ejemplo n.º 4
0
    def test_check_estimator_filter(self):
        """Smoke-test Filter on a mixed numeric/text frame.

        The transform is bound to two numeric columns with the ``<<``
        operator and must return a non-empty result.
        """
        train_data = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                      Petal_Width=[.8, .7, .9, 0.7],
                      Species=["setosa", "virginica", "", 'versicolor']))

        # Named na_filter rather than `filter` to avoid shadowing the builtin.
        na_filter = Filter() << ["Sepal_Length", "Petal_Length"]
        data_idv = na_filter.fit_transform(train_data)
        assert data_idv is not None
        assert len(data_idv) > 0
Ejemplo n.º 5
0
###############################################################################
# Filter
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Filter

# Small iris-like table with one NaN in Sepal_Length and one in Petal_Length.
with_nans = pd.DataFrame(
    data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
              Sepal_Width=[.75, .9, .8, .76],
              Petal_Length=[np.nan, 2.5, 2.6, 2.4],
              Petal_Width=[.8, .7, .9, 0.7],
              Species=["setosa", "viginica", "", 'versicolor']))

# write NaNs to file to see how transforms work
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

# Example of a hand-written schema for reading directly from text files.
# The two string pieces need a separating space, otherwise they fuse into
# '...R4:1col=Sepal_Length...'.
# NOTE(review): this schema is not passed to read_csv below, which builds its
# own schema (printed right after); its column indices assume alphabetical
# column order in the file -- confirm before reusing it.
schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
         'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 col=Species:TX:4 header+'
data = FileDataStream.read_csv(tmpfile)
print(data.schema)

# filter out rows where Sepal_Length or Petal_Length is NaN
nafilter = Filter() << ['Sepal_Length', 'Petal_Length']

print(with_nans)
print('NAFilter\n', nafilter.fit_transform(data))
Ejemplo n.º 6
0
###############################################################################
# Filter
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Filter

# Four measurements per row; rows 0 and 1 each contain one NaN.
measurements = dict(
    Sepal_Length=[2.5, np.nan, 2.1, 1.0],
    Sepal_Width=[.75, .9, .8, .76],
    Petal_Length=[np.nan, 2.5, 2.6, 2.4],
    Petal_Width=[.8, .7, .9, 0.7],
)
with_nans = pd.DataFrame(data=measurements)

# Write the frame (NaNs included) to a file to show how this transform works
# on data streamed from disk.
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

data = FileDataStream.read_csv(tmpfile, sep=',', numeric_dtype=np.float32)

# Transform usage: inspect all four columns for missing values.
filter_columns = [
    'Petal_Length', 'Petal_Width', 'Sepal_Length', 'Sepal_Width'
]
xf = Filter(columns=filter_columns)

# Fit and transform in one step.
features = xf.fit_transform(data)

# Print the filtered features. Sample output recorded by the original author:
# NOTE(review): only one row is listed although two input rows are free of
# NaNs -- re-run to confirm the exact output.
print(features.head())
#    Petal_Length  Petal_Width  Sepal_Length  Sepal_Width
# 0           2.4          0.7           1.0         0.76