def test_data_schema_read_schema_tab(self):
    """Check the schema string of a DataFrame, without and with ``sep='\\t'``."""
    frame = pandas.DataFrame(
        dict(a=[0, 1], b=[0.1, 1.1], c=['r', 'd'], d=[False, True]))

    # No separator requested: only the columns and the header flag show up.
    default_schema = DataSchema.read_schema(frame)
    assert str(default_schema) == (
        'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 header=+')

    # A tab separator is rendered as ``sep=tab`` in the string form.
    tab_schema = DataSchema.read_schema(frame, sep='\t')
    assert str(tab_schema) == (
        'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 '
        'header=+ sep=tab')
def test_schema_dtype_regex(self):
    """Read a schema whose ``dtype`` mapping uses a regex-style column key.

    The ``'Features_[0-9]{1,2}'`` key is expected to type the collapsed
    feature columns as R4, as shown by the asserted schema string.
    """
    path = get_dataset('gen_tickettrain').as_filepath()
    column_names = {
        0: 'Label',
        1: 'GroupId',
        2: 'carrier',
        (3, None): 'Features',
    }
    column_types = {
        'GroupId': str,
        'Label': numpy.float32,
        'carrier': str,
        'Features_[0-9]{1,2}': numpy.float32,
    }
    file_schema = DataSchema.read_schema(
        path, collapse='all', sep=',', names=column_names,
        dtype=column_types)
    file_schema.rename('Features_0', 'Features')

    expected = ('col=Label:R4:0 col=GroupId:TX:1 '
                'col=carrier:TX:2 col=Features:R4:3-7 '
                'header=+ sep=,')
    assert str(file_schema) == expected
def test_data_schema_read_schema(self):
    """Check schema strings for a DataFrame, a ``csr_matrix`` and ``matrix`` inputs."""
    frame = pandas.DataFrame(
        dict(a=[0, 1], b=[0.1, 1.1], c=['r', 'd'], d=[False, True]))

    plain_schema = DataSchema.read_schema(frame)
    assert str(plain_schema) == (
        'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 header=+')

    comma_schema = DataSchema.read_schema(frame, sep=',')
    assert str(comma_schema) == (
        'col=a:I8:0 col=b:R8:1 col=c:TX:2 col=d:BL:3 '
        'header=+ sep=,')

    # int32 values in a csr_matrix: a single I4 column range named "Data".
    sparse_int = csr_matrix([[0, 1], [1, 0]], dtype='int32')
    sparse_schema = DataSchema.read_schema(sparse_int, sep=',')
    assert str(sparse_schema) == 'col=Data:I4:0-1 header=+ sep=,'

    # Same values in a ``matrix``: same expected schema.
    dense_int = matrix([[0, 1], [1, 0]], dtype='int32')
    dense_int_schema = DataSchema.read_schema(dense_int, sep=',')
    assert str(dense_int_schema) == 'col=Data:I4:0-1 header=+ sep=,'

    # Float content without an explicit dtype is expected to come out as R8.
    dense_real = matrix([[0, 1], [1.5, 0.5]])
    dense_real_schema = DataSchema.read_schema(dense_real, sep=',')
    assert str(dense_real_schema) == 'col=Data:R8:0-1 header=+ sep=,'
def test_defaults(self):
    """Run ``check_cv`` on a one-hot + KMeansPlusPlus pipeline over ``infert_file``."""
    schema = DataSchema.read_schema(infert_file, numeric_dtype=np.float32)
    data = FileDataStream.read_csv(infert_file, schema=schema)

    encoder = OneHotVectorizer(columns={'edu': 'education'})
    clusterer = KMeansPlusPlus(
        n_clusters=5,
        feature=['edu', 'age', 'parity', 'spontaneous', 'stratum'])

    check_cv([encoder, clusterer], data)
def test_schema_airquality(self):
    """The airquality schema must be the same via all three construction routes."""
    train_file = get_dataset("airquality").as_filepath()
    schema = ("col=Unnamed0:I8:0 col=Ozone:R8:1 col=Solar_R:R8:2 "
              "col=Wind:R8:3 col=Temp:I8:4 col=Month:I8:5 "
              "col=Day:I8:6 header=+")

    # Route 1: schema inferred directly from the file.
    inferred = DataSchema.read_schema(train_file)
    assert str(inferred) == schema

    # Route 2: stream built from the schema string itself.
    from_string = FileDataStream(train_file, schema)
    assert str(from_string.schema) == schema

    # Route 3: stream that infers its own schema.
    from_csv = FileDataStream.read_csv(train_file)
    assert str(from_csv.schema) == schema
def test_schema_infert_R4(self):
    """With ``numeric_dtype=numpy.float32`` every numeric infert column is R4."""
    train_file = get_dataset("infert").as_filepath()
    schema = ("col=row_num:R4:0 col=education:TX:1 col=age:R4:2 "
              "col=parity:R4:3 col=induced:R4:4 "
              "col=case:R4:5 col=spontaneous:R4:6 col=stratum:R4:7 "
              "col=pooled.stratum:R4:8 header=+")

    inferred = DataSchema.read_schema(train_file,
                                      numeric_dtype=numpy.float32)
    assert str(inferred) == schema

    from_string = FileDataStream(train_file, schema)
    assert str(from_string.schema) == schema

    from_csv = FileDataStream.read_csv(train_file,
                                       numeric_dtype=numpy.float32)
    assert str(from_csv.schema) == schema
def test_schema_infert(self):
    """The default infert schema (I8 numerics) must match via all three routes."""
    train_file = get_dataset("infert").as_filepath()
    schema = ("col=row_num:I8:0 col=education:TX:1 col=age:I8:2 "
              "col=parity:I8:3 col=induced:I8:4 "
              "col=case:I8:5 col=spontaneous:I8:6 col=stratum:I8:7 "
              "col=pooled.stratum:I8:8 header=+")

    inferred = DataSchema.read_schema(train_file)
    assert str(inferred) == schema

    from_string = FileDataStream(train_file, schema)
    assert str(from_string.schema) == schema

    from_csv = FileDataStream.read_csv(train_file)
    assert str(from_csv.schema) == schema
def test_schema_dtype_numpy_trueint(self):
    """An integer numpy array yields an I8 or I4 schema, matching its dtype.

    Raises:
        TypeError: if the array dtype is neither int64 nor int32.
    """
    values = [[1, 1, 2], [3, 5, 6]]
    array = numpy.array(values)
    array_dtype = array.dtype
    schema = DataSchema.read_schema(array)

    # The behavior is not the same on every OS, so the expectation follows
    # the dtype numpy actually picked.
    if array_dtype == numpy.int64:
        expected = 'col=Data:I8:0-2 header=+'
    elif array_dtype == numpy.int32:
        expected = 'col=Data:I4:0-2 header=+'
    else:
        raise TypeError("unexpected type {0}".format(array_dtype))

    assert str(schema) == expected
def test_data_header_no_dataframe(self):
    """Check default column naming for unnamed DataFrame, list and array inputs."""
    flat = [1.0, 1.0, 2.0]
    rows = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]

    # One-column DataFrame built from a flat list.
    single_col = DataSchema.read_schema(pandas.DataFrame(flat))
    assert str(single_col) == 'col=c0:R8:0 header=+'

    # A list of rows gets one named column per position.
    from_list = DataSchema.read_schema(rows)
    assert str(from_list) == (
        'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+')

    # The equivalent DataFrame gives the same result.
    from_frame = DataSchema.read_schema(
        pandas.DataFrame([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]))
    assert str(from_frame) == (
        'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+')

    # A numpy array is reported as one "Data" column range instead.
    from_array = DataSchema.read_schema(
        numpy.array([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]))
    assert str(from_array) == 'col=Data:R8:0-2 header=+'

    # header=False flips the header flag in the string form.
    no_header = DataSchema.read_schema(
        pandas.DataFrame([1.0, 1.0, 2.0]), header=False)
    assert str(no_header) == 'col=c0:R8:0 header=-'
def test_ensemble_supports_cv_with_user_defined_transforms(self):
    """Cross-validate a voting ensemble behind user transforms.

    For each ``split_start`` mode, the same Indicator/Handler transforms
    are cross-validated first with a single LightGbmRegressor and then
    with a VotingRegressor over three regressors. The ensemble's average
    L2 metric must be strictly lower than that of LightGBM alone.
    """
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
    handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
    lgbm_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ols_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ogd_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'shuffle': False,
        'normalize': 'Yes'
    }

    for split_start in ['before_transforms', 'after_transforms']:
        # Baseline: LightGBM alone behind the transforms.
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            LightGbmRegressor(**lgbm_args)
        ]
        cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
        l2_avg_lgbm = cv_results['metrics_summary'].loc['Average',
                                                        'L2(avg)']

        r1 = OrdinaryLeastSquaresRegressor(**ols_args)
        r2 = OnlineGradientDescentRegressor(**ogd_args)
        r3 = LightGbmRegressor(**lgbm_args)

        # A fresh stream is created before the ensemble run.
        data = FileDataStream(path, schema)
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ]
        cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
        l2_avg_ensemble = cv_results['metrics_summary'].loc['Average',
                                                            'L2(avg)']

        # assertLess reports both values on failure, unlike assertTrue.
        self.assertLess(l2_avg_ensemble, l2_avg_lgbm)
def test_data_schema_collapse_no_file(self):
    """Infer a schema from CSV text held in an in-memory ``StringIO``."""
    frame = pandas.DataFrame(dict(tt=['a', 'b', 'cc', 'dd', 'ee']))
    # Columns are added one by one so their order in the CSV is fixed.
    frame['ff'] = 0.2
    frame['ff2'] = 0.1
    frame['tt1'] = 'rt'
    frame['ii'] = 5
    frame['gg'] = 3.4

    written = StringIO()
    frame.to_csv(written, index=False)
    # Wrap the text in a new buffer positioned at the start for reading.
    readable = StringIO(written.getvalue())

    schema_text = str(DataSchema.read_schema(readable))
    self.assertEqual(
        schema_text,
        'col=tt:TX:0 col=ff:R8:1 col=ff2:R8:2 col=tt1:TX:3 '
        'col=ii:I8:4 col=gg:R8:5 header=+')
def test_schema_collapse_all(self):
    """``collapse='all'`` on infert merges columns 2-4 and 6-8 into one entry."""
    path = get_dataset('infert').as_filepath()
    pinned_names = {0: 'row_num', 5: 'case'}
    file_schema = DataSchema.read_schema(
        path,
        collapse='all',
        sep=',',
        numeric_dtype=numpy.float32,
        names=pinned_names)
    file_schema.rename('age', 'Features')

    expected = ("col=row_num:R4:0 col=education:TX:1 "
                "col=Features:R4:2-4,6-8 col=case:R4:5 "
                "header=+ sep=,")
    assert str(file_schema) == expected
def test_schema_dtype_slice(self):
    """Per-column ``dtype`` overrides combined with ``collapse='all'``."""
    path = get_dataset('gen_tickettrain').as_filepath()
    pinned_names = {0: 'Label', 1: 'GroupId'}
    column_types = {
        'GroupId': str,
        'Label': numpy.float32,
        'carrier': str,
        'price': numpy.float32,
    }
    file_schema = DataSchema.read_schema(
        path, sep=',', collapse='all', names=pinned_names,
        dtype=column_types)

    expected = ('col=Label:R4:0 col=GroupId:TX:1 '
                'col=carrier:TX:2 col=price:R4:3 '
                'col=Class:I8:4-6 col=duration:R8:7 header=+ '
                'sep=,')
    assert str(file_schema) == expected
def test_split_start_with_transforms_with_presteps(self):
    """Inspect the dry-run graph produced with ``split_start='after_transforms'``.

    The missing-value handler must appear among the top-level graph nodes
    but not among the cross-validator's sub-nodes, and a model combiner
    node must be present at the top level.
    """
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    pipeline_steps = [
        Indicator() << {
            'Ozone_ind': 'Ozone',
            'Solar_R_ind': 'Solar_R'
        },
        Handler(replace_with='Mean') << {
            'Solar_R': 'Solar_R',
            'Ozone': 'Ozone'
        },
        LightGbmRegressor(
            feature=['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            label='Wind')
    ]

    # With dry_run=True the fit result is a JSON string, decoded below.
    results = CV(pipeline_steps).fit(data,
                                     split_start='after_transforms',
                                     dry_run=True)
    results = json.loads(results)

    node_names = [ep['Name'] for ep in results['nodes']]
    cv_node = [
        ep for ep in results['nodes']
        if 'Models.CrossValidator' in ep['Name']
    ][0]
    cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]

    # assertIn/assertNotIn print the container on failure, which
    # assertTrue(x in y) does not.
    self.assertIn('Transforms.MissingValueHandler', node_names)
    self.assertNotIn('Transforms.MissingValueHandler', cv_sub_node_names)
    self.assertIn('Transforms.ModelCombiner', node_names)
def test_schema_documentation(self):
    """Exercise ``DataSchema.read_schema`` on small frames and CSV files.

    CSV inputs are written to ``data.txt`` inside a temporary directory
    that is removed afterwards, so the test leaves nothing behind in the
    current working directory.

    The schema-string comparisons only run on Python >= 3.6 -- presumably
    because the keyword order passed to ``OrderedDict`` is only guaranteed
    from that version on (TODO confirm).
    """
    import os
    import shutil
    import tempfile

    compare_strings = sys.version_info[:2] >= (3, 6)

    # In-memory DataFrame, including a float32 column.
    data = DataFrame(
        OrderedDict(real=[0.1, 0.2], integer=[1, 2], text=["a", "b"]))
    data['real32'] = data['real'].astype(numpy.float32)
    schema = DataSchema.read_schema(data)
    if compare_strings:
        assert str(schema) == (
            'col=real:R8:0 col=integer:I8:1 col=text:TX:2 '
            'col=real32:R4:3 header=+')

    tmp_dir = tempfile.mkdtemp()
    try:
        csv_path = os.path.join(tmp_dir, 'data.txt')

        # Plain read from a CSV file.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], integer=[1, 2], text=["a", "b"]))
        data.to_csv(csv_path, index=False)
        schema = DataSchema.read_schema(csv_path)
        if compare_strings:
            assert str(schema) == (
                'col=real:R8:0 col=integer:I8:1 col=text:TX:2'
                ' header=+')

        # collapse=True merges the two adjacent real columns.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], real2=[0.1, 0.2],
                        integer=[1, 2], text=["a", "b"]))
        data.to_csv(csv_path, index=False)
        schema = DataSchema.read_schema(csv_path, collapse=True)
        if compare_strings:
            assert str(schema) == (
                'col=real:R8:0-1 col=integer:I8:2 '
                'col=text:TX:3 header=+')

        # Renaming by position combined with collapsing.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], text1=["a", "b"],
                        text2=["a", "b"]))
        data.to_csv(csv_path, index=False)
        schema = DataSchema.read_schema(
            csv_path, collapse=True, names={0: 'newname', 1: 'newname2'})
        if compare_strings:
            assert str(schema) == (
                'col=newname:R8:0 col=newname2:TX:1-2 header=+')

        # An open-ended range name without collapsing gives suffixed names.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], text1=["a", "b"],
                        text2=["a", "b"]))
        data.to_csv(csv_path, index=False)
        schema = DataSchema.read_schema(
            csv_path, collapse=False, names={(1, None): 'text'})
        if compare_strings:
            assert str(schema) == (
                'col=real:R8:0 col=text_0:TX:1 '
                'col=text_1:TX:2 header=+')

        # Per-column dtype override.
        data = DataFrame(OrderedDict(real=[0.1, 0.2], text1=["a", "b"]))
        data.to_csv(csv_path, index=False)
        schema = DataSchema.read_schema(
            csv_path, collapse=True, dtype={'real': numpy.float32})
        if compare_strings:
            assert str(schema) == 'col=real:R4:0 col=text1:TX:1 header=+'
    finally:
        # Best-effort cleanup; a removal failure must not mask a test failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # repr() checks on the last schema and each of its columns.
    for c in schema:
        assert repr(c).startswith("DataColumn(name='")
    assert repr(schema).startswith("DataSchema([DataColumn(name='")
###############################################################################
# CV - cross-validate data
import numpy as np
from nimbusml import Pipeline, FileDataStream, DataSchema
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import (
    LogisticRegressionClassifier,
    FastLinearRegressor,
)
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

# Load the infert dataset, reading numeric columns as float32.
path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

# One-hot encode 'education' into 'edu', then classify 'induced'.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(
        feature=['age', 'spontaneous', 'edu'],
        label='induced'),
])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# print summary statistic of metrics
print(cv_results['metrics_summary'])

# print metrics for all folds
print(cv_results['metrics'])
def test_schema_dtype_numpy_float(self):
    """A float numpy array is described as a single R8 "Data" column range."""
    array = numpy.array([[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]])
    schema_text = str(DataSchema.read_schema(array))
    assert schema_text == 'col=Data:R8:0-2 header=+'
############################################################################### # CharTokenizer import numpy from nimbusml import FileDataStream, DataSchema, Pipeline from nimbusml.datasets import get_dataset from nimbusml.preprocessing import FromKey from nimbusml.preprocessing.text import CharTokenizer from nimbusml.preprocessing.schema import ColumnSelector from nimbusml.feature_extraction.text import WordEmbedding # data input (as a FileDataStream) path = get_dataset('wiki_detox_train').as_filepath() file_schema = DataSchema.read_schema( path, sep='\t', numeric_dtype=numpy.float32) data = FileDataStream(path, schema=file_schema) print(data.head()) # Sentiment SentimentText # 0 1.0 ==RUDE== Dude, you are rude upload that carl p... # 1 1.0 == OK! == IM GOING TO VANDALIZE WILD ONES WIK... # 2 1.0 Stop trolling, zapatancas, calling me a liar m... # 3 1.0 ==You're cool== You seem like a really cool g... # 4 1.0 ::::: Why are you threatening me? I'm not bein... # After using Character Tokenizer, it will convert the vector of Char to Key type. # Use FromKey to retrieve the data from Key first, then send into WordEmbedding. pipe = Pipeline([ CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}), FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
def test_schema_dtype_list_trueint(self):
    """A list of integer rows yields one I8 column per position."""
    rows = [[1, 1, 2], [3, 5, 6]]
    schema_text = str(DataSchema.read_schema(rows))
    assert schema_text == 'col=c0:I8:0 col=c1:I8:1 col=c2:I8:2 header=+'
def test_schema_dtype_list_int(self):
    """A list of float rows yields one R8 column per position.

    Despite the test name, the input values are floats.
    """
    rows = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
    schema_text = str(DataSchema.read_schema(rows))
    assert schema_text == 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+'