Example #1: read a file schema with collapse='all', numeric_dtype=numpy.float32 and custom column names, then rename a column.
    def test_schema_collapse_all(self):
        path = get_dataset('infert').as_filepath()

        file_schema = DataSchema.read_schema(path,
                                             collapse='all',
                                             sep=',',
                                             numeric_dtype=numpy.float32,
                                             names={
                                                 0: 'row_num',
                                                 5: 'case'
                                             })
        file_schema.rename('age', 'Features')
        assert str(
            file_schema) == "col=row_num:R4:0 col=education:TX:1 " \
                            "col=Features:R4:2-4,6-8 col=case:R4:5 " \
                            "header=+ sep=,"
Example #2: infer a schema from CSV text with collapse=True, which groups adjacent columns of the same type.
    def test_data_schema_collapse_yes_file_loader(self):

        df = pandas.DataFrame(dict(tt=['a', 'b', 'cc', 'dd', 'ee']))
        df['ff'] = 0.2
        df['ff2'] = 0.1
        df['tt1'] = 'rt'
        df['ii'] = 5
        df['gg'] = 3.4
        st = StringIO()
        df.to_csv(st, index=False)
        st = StringIO(st.getvalue())
        sch = DataSchema.read_schema(st, collapse=True, tool='nimbusml')
        s = str(sch)
        self.assertEqual(
            s, 'col=ff:R8:1-2 col=gg:R8:5 col=ii:I8:4 col=tt:TX:0 '
            'col=tt1:TX:3 header=+')
Example #3: infer a schema from a pandas DataFrame without collapsing, so each column keeps its own slot.
    def test_data_schema_collapse_no(self):

        df = pandas.DataFrame(dict(tt=['a', 'b', 'cc', 'dd', 'ee']))
        df['ff'] = 0.2
        df['ff2'] = 0.1
        df['tt1'] = 'rt'
        df['ii'] = 5
        df['gg'] = 3.4
        st = StringIO()
        df.to_csv(st, index=False)
        st = StringIO(st.getvalue())
        df = pandas.read_csv(st)
        sch = DataSchema.read_schema(df)
        s = str(sch)
        self.assertEqual(
            s, 'col=tt:TX:0 col=ff:R8:1 col=ff2:R8:2 col=tt1:TX:3 '
            'col=ii:I8:4 col=gg:R8:5 header=+')
Example #4: cross-validate a VotingRegressor ensemble against a single LightGbmRegressor on pre-transformed data.
    def test_ensemble_supports_cv_without_user_defined_transforms(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        lgbm_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ols_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ogd_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'shuffle': False,
            'normalize': 'Yes'
        }

        pipeline = Pipeline([
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args
        ])
        transformed_data = pipeline.fit_transform(data, as_binary_data_stream=True)

        pipeline_steps = [LightGbmRegressor(**lgbm_args)]
        cv_results = CV(pipeline_steps).fit(transformed_data)
        l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        r1 = OrdinaryLeastSquaresRegressor(**ols_args)
        r2 = OnlineGradientDescentRegressor(**ogd_args)
        r3 = LightGbmRegressor(**lgbm_args)

        pipeline_steps = [VotingRegressor(estimators=[r1, r2, r3], combiner='Average')]
        cv_results = CV(pipeline_steps).fit(transformed_data)
        l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
Example #5: override individual column dtypes with the dtype argument while collapsing the rest.
    def test_schema_dtype_slice(self):
        path = get_dataset('gen_tickettrain').as_filepath()
        file_schema = DataSchema.read_schema(path,
                                             sep=',',
                                             collapse='all',
                                             names={
                                                 0: 'Label',
                                                 1: 'GroupId'
                                             },
                                             dtype={
                                                 'GroupId': str,
                                                 'Label': numpy.float32,
                                                 'carrier': str,
                                                 'price': numpy.float32
                                             })
        assert str(
            file_schema) == 'col=Label:R4:0 col=GroupId:TX:1 ' \
                            'col=carrier:TX:2 col=price:R4:3 ' \
                            'col=Class:I8:4-6 col=duration:R8:7 header=+ ' \
                            'sep=,'
Example #6: use split_start='after_transforms' with dry_run=True to inspect which transforms end up inside the CV node.
    def test_split_start_with_transforms_with_presteps(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        pipeline_steps = [
            Indicator() << {
                'Ozone_ind': 'Ozone',
                'Solar_R_ind': 'Solar_R'
            },
            Handler(replace_with='Mean') << {
                'Solar_R': 'Solar_R',
                'Ozone': 'Ozone'
            },
            LightGbmRegressor(feature=[
                'Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'
            ],
                              label='Wind')
        ]

        results = CV(pipeline_steps).fit(data,
                                         split_start='after_transforms',
                                         dry_run=True)
        results = json.loads(results)

        node_names = [ep['Name'] for ep in results['nodes']]
        cv_node = [
            ep for ep in results['nodes']
            if 'Models.CrossValidator' in ep['Name']
        ][0]
        cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]

        self.assertTrue('Transforms.MissingValueHandler' in node_names)
        self.assertTrue(
            'Transforms.MissingValueHandler' not in cv_sub_node_names)
        self.assertTrue('Transforms.ModelCombiner' in node_names)
Example #7: end-to-end cross-validation of a classification pipeline.
###############################################################################
# CV - cross-validate data
import numpy as np
from nimbusml import Pipeline, FileDataStream, DataSchema
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier, \
    FastLinearRegressor
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(feature=['age', 'spontaneous', 'edu'],
                                 label='induced')
])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# print summary statistics of the metrics
print(cv_results['metrics_summary'])

# print metrics for all folds
print(cv_results['metrics'])
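
# A hedged follow-up, not part of the original example: the summary table has
# an 'Average' row, which the CV tests above index with .loc['Average', ...].
# The metric column name used here is an assumption for this multiclass task.
avg_acc = cv_results['metrics_summary'].loc['Average', 'Accuracy(micro-avg)']
print('3-fold average accuracy:', avg_acc)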
Example #8: schema inference for a numpy float array (a single R8 vector column).
    def test_schema_dtype_numpy_float(self):
        li = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
        mat = numpy.array(li)
        schema = DataSchema.read_schema(mat)
        assert str(schema) == 'col=Data:R8:0-2 header=+'
Example #9: schema inference for a nested list of ints (one I8 column each).
    def test_schema_dtype_list_trueint(self):
        li = [[1, 1, 2], [3, 5, 6]]
        schema = DataSchema.read_schema(li)
        assert str(schema) == 'col=c0:I8:0 col=c1:I8:1 col=c2:I8:2 header=+'
Example #10: schema inference for a nested list of floats (one R8 column each).
    def test_schema_dtype_list_int(self):
        li = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
        schema = DataSchema.read_schema(li)
        assert str(schema) == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+'
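
Extrapolating from the float32-to-R4 mapping the file-based tests above rely on, here is a sketch of the same inference for a float32 array; the expected schema string is an assumption, mirroring the R8 case:

    def test_schema_dtype_numpy_float32(self):
        # Hypothetical test: assumes a float32 array maps to R4, as
        # numeric_dtype=numpy.float32 does in the file-based tests above.
        li = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
        mat = numpy.array(li, dtype=numpy.float32)
        schema = DataSchema.read_schema(mat)
        assert str(schema) == 'col=Data:R4:0-2 header=+'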
Example #11: a tour of read_schema options: collapse, names (including range keys) and dtype.
    def test_schema_documentation(self):

        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], integer=[1, 2], text=["a", "b"]))
        data['real32'] = data['real'].astype(numpy.float32)
        schema = DataSchema.read_schema(data)
        if sys.version_info[:2] >= (3, 6):
            assert str(
                schema) == 'col=real:R8:0 col=integer:I8:1 col=text:TX:2 ' \
                           'col=real32:R4:3 header=+'

        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], integer=[1, 2], text=["a", "b"]))
        data.to_csv('data.txt', index=False)
        schema = DataSchema.read_schema('data.txt')
        if sys.version_info[:2] >= (3, 6):
            assert str(
                schema) == 'col=real:R8:0 col=integer:I8:1 col=text:TX:2' \
                           ' header=+'

        data = DataFrame(
            OrderedDict(real=[0.1, 0.2],
                        real2=[0.1, 0.2],
                        integer=[1, 2],
                        text=["a", "b"]))
        data.to_csv('data.txt', index=False)
        schema = DataSchema.read_schema('data.txt', collapse=True)
        if sys.version_info[:2] >= (3, 6):
            assert str(
                schema) == 'col=real:R8:0-1 col=integer:I8:2 ' \
                           'col=text:TX:3 header=+'

        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], text1=["a", "b"], text2=["a", "b"]))
        data.to_csv('data.txt', index=False)
        schema = DataSchema.read_schema('data.txt',
                                        collapse=True,
                                        names={
                                            0: 'newname',
                                            1: 'newname2'
                                        })
        if sys.version_info[:2] >= (3, 6):
            assert str(
                schema) == 'col=newname:R8:0 col=newname2:TX:1-2 header=+'

        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], text1=["a", "b"], text2=["a", "b"]))
        data.to_csv('data.txt', index=False)
        schema = DataSchema.read_schema('data.txt',
                                        collapse=False,
                                        names={(1, None): 'text'})
        if sys.version_info[:2] >= (3, 6):
            assert str(
                schema) == 'col=real:R8:0 col=text_0:TX:1 ' \
                           'col=text_1:TX:2 header=+'

        data = DataFrame(OrderedDict(real=[0.1, 0.2], text1=["a", "b"]))
        data.to_csv('data.txt', index=False)
        schema = DataSchema.read_schema('data.txt',
                                        collapse=True,
                                        dtype={'real': numpy.float32})
        if sys.version_info[:2] >= (3, 6):
            assert str(schema) == 'col=real:R4:0 col=text1:TX:1 header=+'
        for c in schema:
            assert repr(c).startswith("DataColumn(name='")
        assert repr(schema).startswith("DataSchema([DataColumn(name='")
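
The test above repeatedly writes a scratch 'data.txt' into the working directory. A sketch of a self-cleaning variant (the tempfile approach is an addition, not part of the original test):

import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    tmp_path = os.path.join(tmp, 'data.txt')
    data.to_csv(tmp_path, index=False)
    schema = DataSchema.read_schema(tmp_path)  # same schema as with 'data.txt'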
Example #12: tokenize text into characters with CharTokenizer, then convert back from Key with FromKey for WordEmbedding.
###############################################################################
# CharTokenizer
import numpy
from nimbusml import FileDataStream, DataSchema, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import FromKey
from nimbusml.preprocessing.text import CharTokenizer
from nimbusml.preprocessing.schema import ColumnSelector
from nimbusml.feature_extraction.text import WordEmbedding

# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()

file_schema = DataSchema.read_schema(
    path, sep='\t', numeric_dtype=numpy.float32)
data = FileDataStream(path, schema=file_schema)
print(data.head())

#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool==  You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# CharTokenizer converts the text column into a vector of Key-typed characters.
# Use FromKey to map the Key values back to text before passing them to
# WordEmbedding.

pipe = Pipeline([
        CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
        FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
        # The original snippet is cut off after FromKey; the two steps below
        # are a plausible completion inferred from the imports above.
        WordEmbedding(columns='SentimentText_FromKey'),
        ColumnSelector(columns=['SentimentText_FromKey'])
    ])
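
To round the example off, a hedged usage sketch (the original snippet ends mid-pipeline): fit the pipeline on the data stream and inspect the output.

output = pipe.fit_transform(data)
print(output.head())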