Example #1
    def test_check_estimator_ColumnDuplicator_pairs(self):
        df = pandas.DataFrame(
            data=dict(tokens1=['one_' + str(i) for i in range(8)],
                      tokens2=['two_' + str(i) for i in range(8)]))
        # duplicate 'tokens1' into two new columns with a single transform
        cd = ColumnDuplicator() << {'tokens3': 'tokens1', 'tokens4': 'tokens1'}
        y = cd.fit_transform(df)
        # sum the character codes of every value in the transformed frame
        # as a cheap checksum of its contents
        char_sum = 0
        for v in y.values:
            for c in str(v):
                char_sum = char_sum + ord(c)
        assert_equal(char_sum, 19920, "sum of chars should be %s" % 19920)
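The checksum assertion above is indirect; a minimal standalone sketch (assuming the same nimbusml ColumnDuplicator and pandas.testing are available) can instead verify the duplication column by column:

import pandas
from pandas.testing import assert_series_equal
from nimbusml.preprocessing.schema import ColumnDuplicator

df = pandas.DataFrame(data=dict(tokens1=['one_' + str(i) for i in range(8)],
                                tokens2=['two_' + str(i) for i in range(8)]))

# duplicate 'tokens1' into 'tokens3' and 'tokens4'
cd = ColumnDuplicator() << {'tokens3': 'tokens1', 'tokens4': 'tokens1'}
y = cd.fit_transform(df)

# each copy should match its source exactly (only the names differ)
assert_series_equal(y['tokens3'], y['tokens1'], check_names=False)
assert_series_equal(y['tokens4'], y['tokens1'], check_names=False)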
Example #2
    def test_averagedperceptron_unsupported_losses_syntax(self):
        df = get_dataset("infert").as_df().drop('row_num', axis=1)
        X = df
        y = df['case']

        pipeline = Pipeline([
            OneHotVectorizer(columns={
                'age1': 'age',
                'parity1': 'parity',
                'sp1': 'spontaneous'
            }),
            OneHotVectorizer(columns={'education_str': 'education_str'}),
            ColumnDuplicator(columns={'case2': 'case'}),
            AveragedPerceptronBinaryClassifier(
                feature=['age1', 'education_str'], label='case')
        ])

        # fitting should fail here: the label column 'case' is present in
        # both X and y
        try:
            model = pipeline.fit(X, y, verbose=0)
            raise AssertionError("same column name in X and y")
        except RuntimeError as e:
            assert "If any step in the pipeline has defined Label" in str(e)

        # drop the label from X so 'case' is only supplied via the label role
        X = X.drop('case', axis=1)

        pipeline = Pipeline([
            OneHotVectorizer(columns={
                'age1': 'age',
                'parity1': 'parity',
                'sp1': 'spontaneous'
            }),
            OneHotVectorizer(columns={'education_str': 'education_str'}),
            # ColumnDuplicator(columns={'case2': 'case'}), # does not work
            AveragedPerceptronBinaryClassifier(
                feature=['age1', 'education_str'], label='case')
        ])

        info = pipeline.get_fit_info(df)[0]
        assert info[-1]['inputs'] != ['Feature:Features', 'Label:case']

        model = pipeline.fit(df)
        y_pred_withpipeline = model.predict(X)
        assert set(y_pred_withpipeline.columns) == {
            'PredictedLabel', 'Probability', 'Score'
        }
        assert y_pred_withpipeline.shape == (248, 3)
Example #3
    'ColumnSelector'
}

INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length',
        'Sepal_Width',
        'Petal_Length',
        'Petal_Width',
        'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
        ColumnConcatenator() << {
Example #4
###############################################################################
# ColumnDuplicator
import pandas
from nimbusml.preprocessing.schema import ColumnDuplicator

df = pandas.DataFrame(data=dict(tokens1=['one_' + str(i) for i in range(8)],
                                tokens2=['two_' + str(i) for i in range(8)]))

# duplicate a column
cd = ColumnDuplicator() << {'tokens3': 'tokens1'}
y = cd.fit_transform(df)

# view the three columns
print(y)
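For comparison, the same duplication done directly in pandas (a minimal sketch, not part of the original nimbusml example):

import pandas

df = pandas.DataFrame(data=dict(tokens1=['one_' + str(i) for i in range(8)],
                                tokens2=['two_' + str(i) for i in range(8)]))

# plain-pandas equivalent of the ColumnDuplicator step above
df['tokens3'] = df['tokens1']
print(df)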
Example #5
###############################################################################
# ColumnDuplicator
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnDuplicator

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',')

# transform usage
xf = ColumnDuplicator(
    columns={
        'education_copy': 'education',
        'age_copy': 'age'})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#   age  age_copy  case education education_copy  induced  parity  ...
# 0   26        26     1    0-5yrs         0-5yrs        1       6  ...
# 1   42        42     1    0-5yrs         0-5yrs        1       1  ...
# 2   39        39     1    0-5yrs         0-5yrs        2       6  ...
# 3   34        34     1    0-5yrs         0-5yrs        2       4  ...
# 4   35        35     1   6-11yrs        6-11yrs        1       3  ...
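The transform can also be used as a step inside a nimbusml Pipeline; a minimal sketch, assuming Pipeline.fit_transform accepts a FileDataStream for a transform-only pipeline as in the other nimbusml examples:

from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnDuplicator

path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')

# ColumnDuplicator as one step of a transform-only pipeline
pipe = Pipeline([
    ColumnDuplicator(columns={'education_copy': 'education',
                              'age_copy': 'age'})
])
features = pipe.fit_transform(data)
print(features.head())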