def test_fasttreestweedieregressor(self):
    """Check FastTreesTweedieRegressor's R-squared on the airquality data.

    Loads the built-in 'airquality' dataset, splits it into train and
    test partitions, fits a FastTreesTweedieRegressor constructed with
    no arguments to predict 'Ozone' from the remaining columns, and
    asserts that the R-squared on the test partition lies strictly
    between 0.479 and 0.480.
    """
    # train_test_split is called without a random_state, so the global
    # numpy seed is what keeps the split (and the pinned bounds below)
    # reproducible.
    np.random.seed(0)

    df = get_dataset("airquality").as_df().fillna(0)
    # NOTE(review): fillna(0) has already replaced every missing value,
    # so this filter appears to keep all rows. It is left untouched
    # because the pinned R-squared bounds below were obtained with it;
    # confirm the intent before reordering.
    df = df[df.Ozone.notnull()]

    X_train, X_test, y_train, y_test = train_test_split(
        df.loc[:, df.columns != 'Ozone'], df['Ozone'])

    # Train a model and score
    ftree = FastTreesTweedieRegressor().fit(X_train, y_train)
    scores = ftree.predict(X_test)

    r2 = r2_score(y_test, scores)
    lower_bound, upper_bound = 0.479, 0.480
    # The checked quantity is the R-squared score, so the failure
    # messages name it (they previously said "sum").
    assert_greater(
        r2, lower_bound, "r2 should be greater than %s" % lower_bound)
    assert_less(
        r2, upper_bound, "r2 should be less than %s" % upper_bound)
'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}), 'ColumnConcatenator': ColumnConcatenator(columns={'Features': [ 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa']}), 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']), 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}), 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']), 'DateTimeSplitter': DateTimeSplitter(prefix='dt'), 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'), 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']), 'FromKey': Pipeline([ ToKey(columns=['Sepal_Length']), FromKey(columns=['Sepal_Length']) ]), # GlobalContrastRowScaler currently requires a vector input to work 'GlobalContrastRowScaler': Pipeline([ ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'}) ]), 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
# Path of the 'uciadult_train' dataset file used by the code that follows.
train_file = get_dataset("uciadult_train").as_filepath()

# Name of the label column (declared as column 0 in file_schema).
label_column = 'label'

# Text columns of the dataset, in the order they are declared in
# file_schema (columns 1 through 8).
categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'ethnicity',
    'sex',
    'native-country-region',
]

# Schema string for the training file: comma separated, label in
# column 0, numeric features in columns 9-14, one text column per
# categorical field, and a header row.
file_schema = (
    'sep=, '
    'col=label:R4:0 '
    'col=Features:R4:9-14 '
    'col=workclass:TX:1 '
    'col=education:TX:2 '
    'col=marital-status:TX:3 '
    'col=occupation:TX:4 '
    'col=relationship:TX:5 '
    'col=ethnicity:TX:6 '
    'col=sex:TX:7 '
    'col=native-country-region:TX:8 '
    'header+'
)

# Learner instances, each constructed with no arguments.
learners = [
    # tree ensembles
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    # linear learners
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
]
# Unable to load shared library 'SymSgdNative' or one of its dependencies #SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor(), OneVsRestClassifier(FastLinearBinaryClassifier()), GamRegressor(), GamBinaryClassifier(), PcaAnomalyDetector(), FactorizationMachineBinaryClassifier(), KMeansPlusPlus(n_clusters=2), NaiveBayesClassifier(), FastForestBinaryClassifier(number_of_trees=2), FastForestRegressor(number_of_trees=2), FastTreesBinaryClassifier(number_of_trees=2), FastTreesRegressor(number_of_trees=2), FastTreesTweedieRegressor(number_of_trees=2), LightGbmRegressor(number_of_iterations=2), LightGbmClassifier(), LightGbmBinaryClassifier(number_of_iterations=2) ] learners_not_supported = [ #PcaTransformer(), # REVIEW: crashes ] class TestModelSummary(unittest.TestCase): def test_model_summary(self): for learner in learners: pipeline = Pipeline(
###############################################################################
# FastTreesTweedieRegressor
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesTweedieRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Build train and test partitions from the built-in 'airquality' data set.
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)

df = get_dataset("airquality").as_df().fillna(0)
# NOTE(review): missing values were already replaced by fillna(0) above,
# so this filter appears to keep every row - confirm the intended order.
df = df[df.Ozone.notnull()]

# 'Ozone' is the target; every other column is used as a feature.
features = df.loc[:, df.columns != 'Ozone']
target = df['Ozone']
X_train, X_test, y_train, y_test = train_test_split(features, target)

# Fit the regressor on the training partition, then score the test one.
regressor = FastTreesTweedieRegressor()
ftree = regressor.fit(X_train, y_train)
scores = ftree.predict(X_test)

# Report the R-squared of the predictions against the held-out targets.
r_squared = r2_score(y_test, scores)
print('R-squared fit:', r_squared)
# Read the 'infert' dataset through a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)

print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: one-hot encode 'education' into 'edu', then fit a
# Tweedie regressor predicting 'age' from 'induced' and 'edu'.
stages = [
    OneHotVectorizer(columns={'edu': 'education'}),
    FastTreesTweedieRegressor(feature=['induced', 'edu'], label='age'),
]
pipeline = Pipeline(stages)

# Fit on the data, then evaluate on the same data, requesting scores.
fitted = pipeline.fit(data)
metrics, predictions = fitted.test(data, output_scores=True)

# Show the first predictions.
print(predictions.head())
#        Score
# 0  35.152565
# 1  35.152565
# 2  34.089958
# 3  34.089958
# 4  32.486031

# Show the evaluation metrics.
print(metrics)