コード例 #1
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_data_schema_read_schema_tab(self):
     """Check the schema text of a DataFrame, without and with ``sep='\\t'``."""
     frame = pandas.DataFrame(
         dict(a=[0, 1], b=[0.1, 1.1], c=['r', 'd'], d=[False, True]))

     # No separator requested: the schema text has no ``sep`` entry.
     default_schema = DataSchema.read_schema(frame)
     expected_default = (
         'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 header=+')
     assert str(default_schema) == expected_default

     # A tab separator is expected to be rendered as ``sep=tab``.
     tab_schema = DataSchema.read_schema(frame, sep='\t')
     expected_tab = ('col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 '
                     'header=+ sep=tab')
     assert str(tab_schema) == expected_tab
コード例 #2
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_dtype_regex(self):
     """Read the ticket dataset with a pattern-style ``dtype`` key.

     The ``dtype`` mapping uses the key ``'Features_[0-9]{1,2}'`` for the
     columns named from index 3 onwards; after renaming ``Features_0`` to
     ``Features`` the schema text must show one ``R4`` column over 3-7.
     """
     dataset_path = get_dataset('gen_tickettrain').as_filepath()
     column_names = {
         0: 'Label',
         1: 'GroupId',
         2: 'carrier',
         (3, None): 'Features'
     }
     column_types = {
         'GroupId': str,
         'Label': numpy.float32,
         'carrier': str,
         'Features_[0-9]{1,2}': numpy.float32
     }
     file_schema = DataSchema.read_schema(dataset_path,
                                          collapse='all',
                                          sep=',',
                                          names=column_names,
                                          dtype=column_types)
     file_schema.rename('Features_0', 'Features')

     expected = ('col=Label:R4:0 col=GroupId:TX:1 '
                 'col=carrier:TX:2 col=Features:R4:3-7 '
                 'header=+ sep=,')
     assert str(file_schema) == expected
コード例 #3
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_data_schema_read_schema(self):
     """Check schema text for a DataFrame, a sparse matrix and dense matrices."""
     frame = pandas.DataFrame(
         dict(a=[0, 1], b=[0.1, 1.1], c=['r', 'd'], d=[False, True]))
     assert str(DataSchema.read_schema(frame)) == (
         'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 header=+')
     assert str(DataSchema.read_schema(frame, sep=',')) == (
         'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 '
         'header=+ sep=,')

     # Sparse and dense int32 inputs must yield the same schema text.
     int32_expected = 'col=Data:I4:0-1 header=+ sep=,'
     sparse_ints = csr_matrix([[0, 1], [1, 0]], dtype='int32')
     sparse_schema = DataSchema.read_schema(sparse_ints, sep=',')
     assert str(sparse_schema) == int32_expected

     dense_ints = matrix([[0, 1], [1, 0]], dtype='int32')
     dense_int_schema = DataSchema.read_schema(dense_ints, sep=',')
     assert str(dense_int_schema) == int32_expected

     # Mixed int/float literals give a float matrix, expected as R8.
     dense_floats = matrix([[0, 1], [1.5, 0.5]])
     dense_float_schema = DataSchema.read_schema(dense_floats, sep=',')
     assert str(dense_float_schema) == 'col=Data:R8:0-1 header=+ sep=,'
コード例 #4
0
ファイル: test_cv.py プロジェクト: zyw400/NimbusML-1
 def test_defaults(self):
     """Run ``check_cv`` on a one-hot + k-means pipeline over ``infert_file``."""
     float_schema = DataSchema.read_schema(infert_file,
                                           numeric_dtype=np.float32)
     stream = FileDataStream.read_csv(infert_file, schema=float_schema)

     # 'education' is one-hot encoded into 'edu', which then feeds the
     # clusterer together with the other feature columns.
     encoder = OneHotVectorizer(columns={'edu': 'education'})
     clusterer = KMeansPlusPlus(
         n_clusters=5,
         feature=['edu', 'age', 'parity', 'spontaneous', 'stratum'])
     check_cv([encoder, clusterer], stream)
コード例 #5
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_airquality(self):
     """Check three ways of obtaining the airquality schema agree.

     The schema is inferred with ``DataSchema.read_schema``, passed
     explicitly (as text) to ``FileDataStream``, and inferred again by
     ``FileDataStream.read_csv``; all must render the same string.
     """
     train_file = get_dataset("airquality").as_filepath()
     expected = ("col=Unnamed0:I8:0 col=Ozone:R8:1 col=Solar_R:R8:2 "
                 "col=Wind:R8:3 col=Temp:I8:4 col=Month:I8:5 "
                 "col=Day:I8:6 header=+")

     inferred = DataSchema.read_schema(train_file)
     assert str(inferred) == expected

     explicit_stream = FileDataStream(train_file, expected)
     assert str(explicit_stream.schema) == expected

     csv_stream = FileDataStream.read_csv(train_file)
     assert str(csv_stream.schema) == expected
コード例 #6
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_infert_R4(self):
     """Check the infert schema when numeric columns are read as float32.

     With ``numeric_dtype=numpy.float32`` every numeric column is expected
     as ``R4``; the text column ``education`` stays ``TX``.
     """
     train_file = get_dataset("infert").as_filepath()
     expected = ("col=row_num:R4:0 col=education:TX:1 col=age:R4:2 "
                 "col=parity:R4:3 col=induced:R4:4 "
                 "col=case:R4:5 col=spontaneous:R4:6 col=stratum:R4:7 "
                 "col=pooled.stratum:R4:8 header=+")

     inferred = DataSchema.read_schema(train_file,
                                       numeric_dtype=numpy.float32)
     assert str(inferred) == expected

     explicit_stream = FileDataStream(train_file, expected)
     assert str(explicit_stream.schema) == expected

     csv_stream = FileDataStream.read_csv(train_file,
                                          numeric_dtype=numpy.float32)
     assert str(csv_stream.schema) == expected
コード例 #7
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_infert(self):
     """Check the infert schema with default type inference.

     The same expected text must come from ``DataSchema.read_schema``,
     from a ``FileDataStream`` built with that text, and from
     ``FileDataStream.read_csv``.
     """
     train_file = get_dataset("infert").as_filepath()
     expected = ("col=row_num:I8:0 col=education:TX:1 col=age:I8:2 "
                 "col=parity:I8:3 col=induced:I8:4 "
                 "col=case:I8:5 col=spontaneous:I8:6 col=stratum:I8:7 "
                 "col=pooled.stratum:I8:8 header=+")

     inferred = DataSchema.read_schema(train_file)
     assert str(inferred) == expected

     explicit_stream = FileDataStream(train_file, expected)
     assert str(explicit_stream.schema) == expected

     csv_stream = FileDataStream.read_csv(train_file)
     assert str(csv_stream.schema) == expected
コード例 #8
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_dtype_numpy_trueint(self):
     """Check the schema of an integer ``numpy`` array.

     The dtype numpy picks for Python ints is not the same on every OS,
     so the expected text is chosen from the array's actual dtype.

     Raises:
         TypeError: if the array dtype is neither int64 nor int32.
     """
     values = numpy.array([[1, 1, 2], [3, 5, 6]])
     actual_dtype = values.dtype
     schema = DataSchema.read_schema(values)

     if actual_dtype == numpy.int64:
         expected = 'col=Data:I8:0-2 header=+'
     elif actual_dtype == numpy.int32:
         expected = 'col=Data:I4:0-2 header=+'
     else:
         raise TypeError("unexpected type {0}".format(actual_dtype))
     assert str(schema) == expected
コード例 #9
0
ファイル: test_data_stream.py プロジェクト: zyw400/NimbusML-1
    def test_data_header_no_dataframe(self):
        """Check schema text for unnamed inputs: lists, DataFrames, arrays.

        Inputs without column names are expected to get ``c0``, ``c1``, ...
        (or a single ``Data`` vector column for a numpy array), and
        ``header=False`` must be rendered as ``header=-``.
        """
        three_columns = 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+'

        # A flat list wrapped in a DataFrame becomes one column.
        flat_frame = pandas.DataFrame([1.0, 1.0, 2.0])
        assert str(DataSchema.read_schema(flat_frame)) == \
            'col=c0:R8:0 header=+'

        # A nested list is read directly, one column per inner position.
        nested = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
        assert str(DataSchema.read_schema(nested)) == three_columns

        # The same values wrapped in a DataFrame give the same schema.
        nested_frame = pandas.DataFrame([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]])
        assert str(DataSchema.read_schema(nested_frame)) == three_columns

        # A numpy array is collapsed into one vector column named 'Data'.
        array = numpy.array([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]])
        assert str(DataSchema.read_schema(array)) == \
            'col=Data:R8:0-2 header=+'

        # header=False flips the header flag in the schema text.
        headerless_frame = pandas.DataFrame([1.0, 1.0, 2.0])
        headerless = DataSchema.read_schema(headerless_frame, header=False)
        assert str(headerless) == 'col=c0:R8:0 header=-'
コード例 #10
0
    def test_ensemble_supports_cv_with_user_defined_transforms(self):
        """Compare a voting ensemble against LightGBM alone under CV.

        For each ``split_start`` mode, the airquality data is run through
        the same Indicator + Handler transforms, once followed by a
        LightGBM regressor and once by a ``VotingRegressor`` averaging OLS,
        online gradient descent and LightGBM. The ensemble's average
        ``L2(avg)`` metric must be strictly lower than LightGBM's.
        """
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        lgbm_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ols_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ogd_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'shuffle': False,
            'normalize': 'Yes'
        }

        for split_start in ['before_transforms', 'after_transforms']:
            # Baseline: the transforms followed by a single LightGBM model.
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                LightGbmRegressor(**lgbm_args)
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            r1 = OrdinaryLeastSquaresRegressor(**ols_args)
            r2 = OnlineGradientDescentRegressor(**ogd_args)
            r3 = LightGbmRegressor(**lgbm_args)

            # Candidate: the same transforms followed by the ensemble,
            # fitted on a freshly created stream over the same file.
            data = FileDataStream(path, schema)
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            # assertLess reports both values on failure, unlike
            # assertTrue(a < b) which only reports "False is not true".
            self.assertLess(l2_avg_ensemble, l2_avg_lgbm)
コード例 #11
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
    def test_data_schema_collapse_no_file(self):
        """Check a schema can be read from an in-memory CSV buffer.

        A mixed text/float/int DataFrame is serialised to CSV in a
        ``StringIO`` and the buffer itself, not a file path, is given to
        ``DataSchema.read_schema``.
        """
        frame = pandas.DataFrame(dict(tt=['a', 'b', 'cc', 'dd', 'ee']))
        # Scalar columns are appended in this order, which fixes the
        # column indices expected in the schema text below.
        extra_columns = (('ff', 0.2), ('ff2', 0.1), ('tt1', 'rt'),
                         ('ii', 5), ('gg', 3.4))
        for column_name, value in extra_columns:
            frame[column_name] = value

        written = StringIO()
        frame.to_csv(written, index=False)
        # Re-wrap the text so reading starts from the beginning.
        readable = StringIO(written.getvalue())

        schema_text = str(DataSchema.read_schema(readable))
        self.assertEqual(
            schema_text,
            'col=tt:TX:0 col=ff:R8:1 col=ff2:R8:2 col=tt1:TX:3 '
            'col=ii:I8:4 col=gg:R8:5 header=+')
コード例 #12
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
    def test_schema_collapse_all(self):
        """Check ``collapse='all'`` on the infert file with two named columns.

        Columns 0 and 5 are named explicitly; after renaming ``age`` to
        ``Features`` the expected text shows that column spanning the
        non-contiguous ranges ``2-4,6-8``.
        """
        path = get_dataset('infert').as_filepath()
        fixed_names = {
            0: 'row_num',
            5: 'case'
        }
        file_schema = DataSchema.read_schema(path,
                                             collapse='all',
                                             sep=',',
                                             numeric_dtype=numpy.float32,
                                             names=fixed_names)
        file_schema.rename('age', 'Features')

        expected = ("col=row_num:R4:0 col=education:TX:1 "
                    "col=Features:R4:2-4,6-8 col=case:R4:5 "
                    "header=+ sep=,")
        assert str(file_schema) == expected
コード例 #13
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_dtype_slice(self):
     """Check per-column ``dtype`` overrides on the ticket dataset.

     Two columns are renamed by index and four get an explicit dtype;
     the remaining columns keep their inferred types in the expected
     schema text.
     """
     path = get_dataset('gen_tickettrain').as_filepath()
     renamed = {
         0: 'Label',
         1: 'GroupId'
     }
     forced_types = {
         'GroupId': str,
         'Label': numpy.float32,
         'carrier': str,
         'price': numpy.float32
     }
     file_schema = DataSchema.read_schema(path,
                                          sep=',',
                                          collapse='all',
                                          names=renamed,
                                          dtype=forced_types)

     expected = ('col=Label:R4:0 col=GroupId:TX:1 '
                 'col=carrier:TX:2 col=price:R4:3 '
                 'col=Class:I8:4-6 col=duration:R8:7 header=+ '
                 'sep=,')
     assert str(file_schema) == expected
コード例 #14
0
    def test_split_start_with_transforms_with_presteps(self):
        """Inspect the dry-run graph for ``split_start='after_transforms'``.

        ``CV.fit`` is called with ``dry_run=True`` and its result is parsed
        as JSON. The missing-value handler must appear among the top-level
        nodes but not among the nodes nested inside the cross-validator
        node, and a model-combiner node must be present at the top level.
        """
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        pipeline_steps = [
            Indicator() << {
                'Ozone_ind': 'Ozone',
                'Solar_R_ind': 'Solar_R'
            },
            Handler(replace_with='Mean') << {
                'Solar_R': 'Solar_R',
                'Ozone': 'Ozone'
            },
            LightGbmRegressor(feature=[
                'Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'
            ],
                              label='Wind')
        ]

        results = CV(pipeline_steps).fit(data,
                                         split_start='after_transforms',
                                         dry_run=True)
        results = json.loads(results)

        node_names = [ep['Name'] for ep in results['nodes']]
        # First node whose name mentions the cross-validator; an IndexError
        # here means the graph has no such node.
        cv_node = [
            ep for ep in results['nodes']
            if 'Models.CrossValidator' in ep['Name']
        ][0]
        cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]

        # assertIn/assertNotIn print the searched name and the container
        # on failure, which assertTrue(x in y) does not.
        self.assertIn('Transforms.MissingValueHandler', node_names)
        self.assertNotIn('Transforms.MissingValueHandler', cv_sub_node_names)
        self.assertIn('Transforms.ModelCombiner', node_names)
コード例 #15
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
    def test_schema_documentation(self):
        """Exercise ``DataSchema.read_schema`` on a DataFrame and small CSV files.

        Covers default inference, ``collapse=True``, renaming through
        ``names`` (by index and by open-ended range), a ``dtype`` override,
        and the ``repr`` of the schema and of its columns.

        The CSV files are written into a scratch directory that is removed
        when the test finishes, so nothing is left in the working directory.
        """
        # Local imports: only needed here, for the scratch directory.
        import os
        import shutil
        import tempfile

        # Keyword-argument order, and therefore the column order produced by
        # OrderedDict(**kwargs), is only guaranteed from Python 3.6 on; the
        # schema text is checked only where that order is reliable.
        ordered_kwargs = sys.version_info[:2] >= (3, 6)

        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], integer=[1, 2], text=["a", "b"]))
        data['real32'] = data['real'].astype(numpy.float32)
        schema = DataSchema.read_schema(data)
        if ordered_kwargs:
            assert str(
                schema) == 'col=real:R8:0 col=integer:I8:1 col=text:TX:2 ' \
                           'col=real32:R4:3 header=+'

        # mkdtemp/rmtree is used rather than TemporaryDirectory so the test
        # does not depend on a Python 3-only helper.
        scratch_dir = tempfile.mkdtemp()
        try:
            data_file = os.path.join(scratch_dir, 'data.txt')

            # Default inference from a file.
            data = DataFrame(
                OrderedDict(real=[0.1, 0.2], integer=[1, 2], text=["a", "b"]))
            data.to_csv(data_file, index=False)
            schema = DataSchema.read_schema(data_file)
            if ordered_kwargs:
                assert str(
                    schema) == 'col=real:R8:0 col=integer:I8:1 col=text:TX:2' \
                               ' header=+'

            # collapse=True: adjacent columns of the same type are merged.
            data = DataFrame(
                OrderedDict(real=[0.1, 0.2],
                            real2=[0.1, 0.2],
                            integer=[1, 2],
                            text=["a", "b"]))
            data.to_csv(data_file, index=False)
            schema = DataSchema.read_schema(data_file, collapse=True)
            if ordered_kwargs:
                assert str(
                    schema) == 'col=real:R8:0-1 col=integer:I8:2 ' \
                               'col=text:TX:3 header=+'

            # collapse=True combined with renaming by column index.
            data = DataFrame(
                OrderedDict(real=[0.1, 0.2], text1=["a", "b"],
                            text2=["a", "b"]))
            data.to_csv(data_file, index=False)
            schema = DataSchema.read_schema(data_file,
                                            collapse=True,
                                            names={
                                                0: 'newname',
                                                1: 'newname2'
                                            })
            if ordered_kwargs:
                assert str(
                    schema) == 'col=newname:R8:0 col=newname2:TX:1-2 header=+'

            # collapse=False with an open-ended range name: each column in
            # the range gets a numbered suffix.
            data = DataFrame(
                OrderedDict(real=[0.1, 0.2], text1=["a", "b"],
                            text2=["a", "b"]))
            data.to_csv(data_file, index=False)
            schema = DataSchema.read_schema(data_file,
                                            collapse=False,
                                            names={(1, None): 'text'})
            if ordered_kwargs:
                assert str(
                    schema) == 'col=real:R8:0 col=text_0:TX:1 ' \
                               'col=text_1:TX:2 header=+'

            # Explicit dtype override for one column.
            data = DataFrame(OrderedDict(real=[0.1, 0.2], text1=["a", "b"]))
            data.to_csv(data_file, index=False)
            schema = DataSchema.read_schema(data_file,
                                            collapse=True,
                                            dtype={'real': numpy.float32})
            if ordered_kwargs:
                assert str(schema) == 'col=real:R4:0 col=text1:TX:1 header=+'

            # repr of each column and of the whole schema.
            for c in schema:
                assert repr(c).startswith("DataColumn(name='")
            assert repr(schema).startswith("DataSchema([DataColumn(name='")
        finally:
            # Best-effort cleanup; a failure to delete must not mask the
            # real test outcome.
            shutil.rmtree(scratch_dir, ignore_errors=True)
コード例 #16
0
ファイル: CV.py プロジェクト: zyw400/NimbusML-1
###############################################################################
# CV - cross-validate data
import numpy as np
from nimbusml import Pipeline, FileDataStream, DataSchema
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier, \
    FastLinearRegressor
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

# Open the 'infert' dataset as a file stream, asking for float32 numeric
# columns when the schema is inferred.
path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

# 'education' is one-hot encoded into 'edu', which the classifier then uses
# as a feature alongside 'age' and 'spontaneous' to predict 'induced'.
encoder = OneHotVectorizer(columns={'edu': 'education'})
classifier = LogisticRegressionClassifier(
    feature=['age', 'spontaneous', 'edu'], label='induced')
pipeline = Pipeline([encoder, classifier])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# Print the summary statistics of the metrics, then the metrics for all folds.
for result_key in ('metrics_summary', 'metrics'):
    print(cv_results[result_key])
コード例 #17
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_dtype_numpy_float(self):
     """A float ``numpy`` array must be read as one ``R8`` column over 0-2."""
     values = numpy.array([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]])
     schema_text = str(DataSchema.read_schema(values))
     assert schema_text == 'col=Data:R8:0-2 header=+'
コード例 #18
0
###############################################################################
# CharTokenizer
import numpy
from nimbusml import FileDataStream, DataSchema, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import FromKey
from nimbusml.preprocessing.text import CharTokenizer
from nimbusml.preprocessing.schema import ColumnSelector
from nimbusml.feature_extraction.text import WordEmbedding

# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()

# The schema is inferred with a tab separator and float32 numeric columns.
schema_options = {'sep': '\t', 'numeric_dtype': numpy.float32}
file_schema = DataSchema.read_schema(path, **schema_options)
data = FileDataStream(path, schema=file_schema)
print(data.head())

#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool==  You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# CharTokenizer outputs a vector of characters stored as Key type.
# Use FromKey to retrieve the values from the keys first, then pass the
# result to WordEmbedding.

pipe = Pipeline([
        CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
        FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
コード例 #19
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_dtype_list_trueint(self):
     """A nested list of Python ints must give one ``I8`` column per position."""
     rows = [[1, 1, 2], [3, 5, 6]]
     schema_text = str(DataSchema.read_schema(rows))
     assert schema_text == 'col=c0:I8:0 col=c1:I8:1 col=c2:I8:2 header=+'
コード例 #20
0
ファイル: test_data_schema.py プロジェクト: zyw400/NimbusML-1
 def test_schema_dtype_list_int(self):
     """A nested list of floats must give one ``R8`` column per position.

     Despite the test name, the values are float literals, so the
     expected column type is ``R8`` rather than an integer type.
     """
     rows = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
     schema_text = str(DataSchema.read_schema(rows))
     assert schema_text == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+'