def test_schema_collapse_all(self):
    """Read the ``infert`` schema with ``collapse='all'`` and rename a column.

    Columns 0 and 5 are named explicitly, the column called ``age`` is
    renamed to ``Features``, and the textual form of the schema is
    compared with the expected string (where ``Features`` spans columns
    2-4 and 6-8).
    """
    infert_csv = get_dataset('infert').as_filepath()
    explicit_names = {0: 'row_num', 5: 'case'}

    file_schema = DataSchema.read_schema(
        infert_csv,
        collapse='all',
        sep=',',
        numeric_dtype=numpy.float32,
        names=explicit_names)
    file_schema.rename('age', 'Features')

    expected = ("col=row_num:R4:0 col=education:TX:1 "
                "col=Features:R4:2-4,6-8 col=case:R4:5 "
                "header=+ sep=,")
    assert str(file_schema) == expected
def test_data_schema_collapse_yes_file_loader(self):
    """Infer a schema with ``collapse=True`` from an in-memory CSV stream.

    A small frame is serialised to CSV text, wrapped in a fresh
    ``StringIO`` and handed to ``DataSchema.read_schema`` with
    ``tool='nimbusml'``; the resulting schema string is then checked.
    """
    frame = pandas.DataFrame(dict(tt=['a', 'b', 'cc', 'dd', 'ee']))
    # Constant-valued columns, appended in this exact order.
    constant_columns = (
        ('ff', 0.2),
        ('ff2', 0.1),
        ('tt1', 'rt'),
        ('ii', 5),
        ('gg', 3.4),
    )
    for column_name, constant in constant_columns:
        frame[column_name] = constant

    csv_buffer = StringIO()
    frame.to_csv(csv_buffer, index=False)
    # Re-wrap the text so the reader starts at the beginning of the stream.
    csv_stream = StringIO(csv_buffer.getvalue())

    schema = DataSchema.read_schema(csv_stream, collapse=True,
                                    tool='nimbusml')
    expected = ('col=ff:R8:1-2 col=gg:R8:5 col=ii:I8:4 col=tt:TX:0 '
                'col=tt1:TX:3 header=+')
    self.assertEqual(str(schema), expected)
def test_data_schema_collapse_no(self):
    """Infer a schema from a DataFrame without asking for collapsing.

    The frame is round-tripped through CSV text (written, then re-read
    with ``pandas.read_csv``) before ``DataSchema.read_schema`` is called
    on it; every column is expected to appear on its own in the schema.
    """
    frame = pandas.DataFrame(dict(tt=['a', 'b', 'cc', 'dd', 'ee']))
    # Constant-valued columns, appended in this exact order.
    constant_columns = (
        ('ff', 0.2),
        ('ff2', 0.1),
        ('tt1', 'rt'),
        ('ii', 5),
        ('gg', 3.4),
    )
    for column_name, constant in constant_columns:
        frame[column_name] = constant

    csv_buffer = StringIO()
    frame.to_csv(csv_buffer, index=False)
    reloaded = pandas.read_csv(StringIO(csv_buffer.getvalue()))

    schema = DataSchema.read_schema(reloaded)
    expected = ('col=tt:TX:0 col=ff:R8:1 col=ff2:R8:2 col=tt1:TX:3 '
                'col=ii:I8:4 col=gg:R8:5 header=+')
    self.assertEqual(str(schema), expected)
def test_ensemble_supports_cv_without_user_defined_transforms(self):
    """Cross-validate a voting ensemble on data transformed beforehand.

    The ``airquality`` data is first passed through an Indicator/Handler
    pipeline outside of CV.  A single LightGbmRegressor and an averaging
    VotingRegressor (OLS + OGD + LightGBM) are then each cross-validated
    on the transformed data, and the ensemble's average ``L2(avg)`` is
    required to be strictly lower than that of LightGBM alone.
    """
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
    handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}

    # All three learners use the same features; each dict receives its
    # own copy of the list so no learner can affect another's argument.
    feature_columns = ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind',
                       'Temp']
    lgbm_args = {
        'feature': list(feature_columns),
        'label': 'Wind',
        'normalize': 'Yes',
    }
    ols_args = {
        'feature': list(feature_columns),
        'label': 'Wind',
        'normalize': 'Yes',
    }
    ogd_args = {
        'feature': list(feature_columns),
        'label': 'Wind',
        'shuffle': False,
        'normalize': 'Yes',
    }

    # Apply the missing-value transforms once, outside cross-validation.
    pipeline = Pipeline([
        Indicator() << ind_args,
        Handler(replace_with='Mean') << handler_args
    ])
    transformed_data = pipeline.fit_transform(data,
                                              as_binary_data_stream=True)

    # Baseline: LightGBM on its own.
    pipeline_steps = [LightGbmRegressor(**lgbm_args)]
    cv_results = CV(pipeline_steps).fit(transformed_data)
    l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

    # Ensemble: average of OLS, OGD and LightGBM predictions.
    r1 = OrdinaryLeastSquaresRegressor(**ols_args)
    r2 = OnlineGradientDescentRegressor(**ogd_args)
    r3 = LightGbmRegressor(**lgbm_args)
    pipeline_steps = [VotingRegressor(estimators=[r1, r2, r3],
                                      combiner='Average')]
    cv_results = CV(pipeline_steps).fit(transformed_data)
    l2_avg_ensemble = cv_results['metrics_summary'].loc['Average',
                                                        'L2(avg)']

    # assertLess reports both metric values when the comparison fails.
    self.assertLess(l2_avg_ensemble, l2_avg_lgbm)
def test_schema_dtype_slice(self):
    """Read ``gen_tickettrain`` with explicit names and per-column dtypes.

    Columns 0 and 1 are renamed, four columns get an explicit dtype, and
    the schema is read with ``collapse='all'``; the resulting schema
    string is compared with the expected one.
    """
    ticket_csv = get_dataset('gen_tickettrain').as_filepath()
    explicit_names = {0: 'Label', 1: 'GroupId'}
    explicit_dtypes = {
        'GroupId': str,
        'Label': numpy.float32,
        'carrier': str,
        'price': numpy.float32,
    }

    file_schema = DataSchema.read_schema(
        ticket_csv,
        sep=',',
        collapse='all',
        names=explicit_names,
        dtype=explicit_dtypes)

    expected = ('col=Label:R4:0 col=GroupId:TX:1 '
                'col=carrier:TX:2 col=price:R4:3 '
                'col=Class:I8:4-6 col=duration:R8:7 header=+ '
                'sep=,')
    assert str(file_schema) == expected
def test_split_start_with_transforms_with_presteps(self):
    """Inspect the dry-run graph of CV with ``split_start='after_transforms'``.

    The graph returned as JSON text must contain a MissingValueHandler
    node and a ModelCombiner node at the top level, while the nodes
    nested inside the CrossValidator node must not include the
    MissingValueHandler.
    """
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    pipeline_steps = [
        Indicator() << {
            'Ozone_ind': 'Ozone',
            'Solar_R_ind': 'Solar_R'
        },
        Handler(replace_with='Mean') << {
            'Solar_R': 'Solar_R',
            'Ozone': 'Ozone'
        },
        LightGbmRegressor(
            feature=['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind',
                     'Temp'],
            label='Wind')
    ]

    # With dry_run=True the call returns JSON text, parsed here.
    results = CV(pipeline_steps).fit(data,
                                     split_start='after_transforms',
                                     dry_run=True)
    results = json.loads(results)

    node_names = [node['Name'] for node in results['nodes']]
    # Take the first node whose name mentions the cross-validator.
    cv_node = [
        node for node in results['nodes']
        if 'Models.CrossValidator' in node['Name']
    ][0]
    cv_sub_node_names = [node['Name']
                         for node in cv_node['Inputs']['Nodes']]

    # assertIn / assertNotIn print the searched list when they fail.
    self.assertIn('Transforms.MissingValueHandler', node_names)
    self.assertNotIn('Transforms.MissingValueHandler', cv_sub_node_names)
    self.assertIn('Transforms.ModelCombiner', node_names)
###############################################################################
# CV - cross-validate data
import numpy as np
from nimbusml import Pipeline, FileDataStream, DataSchema
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier, \
    FastLinearRegressor
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

# Load the 'infert' dataset as a file-backed stream, reading numeric
# columns as 32-bit floats.
path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

# One-hot encode 'education' into 'edu', then classify 'induced'.
education_encoder = OneHotVectorizer(columns={'edu': 'education'})
induced_classifier = LogisticRegressionClassifier(
    feature=['age', 'spontaneous', 'edu'],
    label='induced')
pipeline = Pipeline([education_encoder, induced_classifier])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# print summary statistic of metrics
print(cv_results['metrics_summary'])

# print metrics for all folds
print(cv_results['metrics'])
def test_schema_dtype_numpy_float(self):
    """Schema of a 2x3 float numpy array is one ``Data`` column over 0-2."""
    rows = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
    matrix = numpy.array(rows)
    schema = DataSchema.read_schema(matrix)
    expected = 'col=Data:R8:0-2 header=+'
    assert str(schema) == expected
def test_schema_dtype_list_trueint(self):
    """Schema of a nested list of ints has one ``I8`` column per position."""
    rows = [[1, 1, 2], [3, 5, 6]]
    schema = DataSchema.read_schema(rows)
    expected = 'col=c0:I8:0 col=c1:I8:1 col=c2:I8:2 header=+'
    assert str(schema) == expected
def test_schema_dtype_list_int(self):
    """Schema of a nested list of floats has one ``R8`` column per position.

    Despite the ``int`` in the test name, the input values are floats.
    """
    rows = [[1.0, 1.0, 2.0], [3.0, 5.0, 6.0]]
    schema = DataSchema.read_schema(rows)
    expected = 'col=c0:R8:0 col=c1:R8:1 col=c2:R8:2 header=+'
    assert str(schema) == expected
def test_schema_documentation(self):
    """Check schema strings for a series of small DataFrame/CSV inputs.

    Covers reading a schema straight from a DataFrame and from a CSV
    file, with and without ``collapse``, with ``names`` (by index and by
    open-ended range) and with an explicit ``dtype``.  Finally checks the
    prefix of ``repr`` for the schema and each of its columns.

    CSV files are written inside a temporary directory that is removed
    when the test ends, so nothing is left in the working directory.
    """
    import os
    import shutil
    import tempfile

    # The exact schema string is only asserted on Python >= 3.6,
    # presumably because keyword arguments passed to OrderedDict keep
    # their order only from that version on.
    ordered_kwargs = sys.version_info[:2] >= (3, 6)

    work_dir = tempfile.mkdtemp()
    csv_path = os.path.join(work_dir, 'data.txt')

    def schema_from_csv(frame, **read_options):
        # Write ``frame`` to the scratch CSV and read its schema back.
        frame.to_csv(csv_path, index=False)
        return DataSchema.read_schema(csv_path, **read_options)

    def check(schema, expected):
        # Compare the textual schema where column order is guaranteed.
        if ordered_kwargs:
            assert str(schema) == expected

    try:
        # Schema read directly from a DataFrame, including a float32.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], integer=[1, 2],
                        text=["a", "b"]))
        data['real32'] = data['real'].astype(numpy.float32)
        check(DataSchema.read_schema(data),
              'col=real:R8:0 col=integer:I8:1 col=text:TX:2 '
              'col=real32:R4:3 header=+')

        # Schema read from a CSV file.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], integer=[1, 2],
                        text=["a", "b"]))
        check(schema_from_csv(data),
              'col=real:R8:0 col=integer:I8:1 col=text:TX:2'
              ' header=+')

        # collapse=True: the two real columns share one entry.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], real2=[0.1, 0.2],
                        integer=[1, 2], text=["a", "b"]))
        check(schema_from_csv(data, collapse=True),
              'col=real:R8:0-1 col=integer:I8:2 '
              'col=text:TX:3 header=+')

        # collapse=True together with names given by column index.
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], text1=["a", "b"],
                        text2=["a", "b"]))
        check(schema_from_csv(data, collapse=True,
                              names={0: 'newname', 1: 'newname2'}),
              'col=newname:R8:0 col=newname2:TX:1-2 header=+')

        # collapse=False with a name given for the range (1, None).
        data = DataFrame(
            OrderedDict(real=[0.1, 0.2], text1=["a", "b"],
                        text2=["a", "b"]))
        check(schema_from_csv(data, collapse=False,
                              names={(1, None): 'text'}),
              'col=real:R8:0 col=text_0:TX:1 '
              'col=text_1:TX:2 header=+')

        # Explicit dtype for one column.
        data = DataFrame(OrderedDict(real=[0.1, 0.2], text1=["a", "b"]))
        schema = schema_from_csv(data, collapse=True,
                                 dtype={'real': numpy.float32})
        check(schema, 'col=real:R4:0 col=text1:TX:1 header=+')

        for c in schema:
            assert repr(c).startswith("DataColumn(name='")
        assert repr(schema).startswith("DataSchema([DataColumn(name='")
    finally:
        # Best-effort cleanup; never mask a test failure with an OSError.
        shutil.rmtree(work_dir, ignore_errors=True)
############################################################################### # CharTokenizer import numpy from nimbusml import FileDataStream, DataSchema, Pipeline from nimbusml.datasets import get_dataset from nimbusml.preprocessing import FromKey from nimbusml.preprocessing.text import CharTokenizer from nimbusml.preprocessing.schema import ColumnSelector from nimbusml.feature_extraction.text import WordEmbedding # data input (as a FileDataStream) path = get_dataset('wiki_detox_train').as_filepath() file_schema = DataSchema.read_schema( path, sep='\t', numeric_dtype=numpy.float32) data = FileDataStream(path, schema=file_schema) print(data.head()) # Sentiment SentimentText # 0 1.0 ==RUDE== Dude, you are rude upload that carl p... # 1 1.0 == OK! == IM GOING TO VANDALIZE WILD ONES WIK... # 2 1.0 Stop trolling, zapatancas, calling me a liar m... # 3 1.0 ==You're cool== You seem like a really cool g... # 4 1.0 ::::: Why are you threatening me? I'm not bein... # After using Character Tokenizer, it will convert the vector of Char to Key type. # Use FromKey to retrieve the data from Key first, then send into WordEmbedding. pipe = Pipeline([ CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}), FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),