Example #1
0
def build_auto_h2o(regressor, name):
    """Fit, verify and export an H2O regressor pipeline on the Auto data.

    Each input column is wrapped in ``CategoricalDomain`` or
    ``ContinuousDomain``, the result is uploaded through
    ``H2OFrameCreator`` with explicit column names and types, and
    ``regressor`` is fitted as the final pipeline step.

    Args:
        regressor: Estimator placed as the last step of the pipeline.
        name: Base output name; ".zip", ".pkl" and ".csv" are appended
            for the ``store_mojo``, ``store_pkl`` and ``store_csv`` calls.
    """
    categorical_columns = ["cylinders", "model_year", "origin"]
    continuous_columns = [
        "displacement", "horsepower", "weight", "acceleration"
    ]

    # One (name, decorator, [column]) entry per column, categorical first.
    domain_steps = [(col, CategoricalDomain(), [col])
                    for col in categorical_columns]
    domain_steps += [(col, ContinuousDomain(), [col])
                     for col in continuous_columns]
    transformer = ColumnTransformer(domain_steps)

    # The uploader's names/types follow the same column order as above.
    uploader = H2OFrameCreator(
        column_names=categorical_columns + continuous_columns,
        column_types=(["enum"] * len(categorical_columns) +
                      ["numeric"] * len(continuous_columns)))

    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", uploader),
        ("regressor", regressor),
    ])
    target_frame = H2OFrame(auto_y.to_frame())
    pipeline.fit(auto_X, target_frame)

    # Verify on a fixed 5% sample (random_state=13) of the training inputs.
    verification_sample = auto_X.sample(frac=0.05, random_state=13)
    pipeline.verify(verification_sample)

    fitted_regressor = pipeline._final_estimator
    store_mojo(fitted_regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")

    predictions = pipeline.predict(auto_X)
    predictions.set_names(["mpg"])
    store_csv(predictions.as_data_frame(), name + ".csv")
Example #2
0
def build_audit_h2o(classifier, name):
	"""Fit, verify and export an H2O classifier pipeline on the Audit data.

	Continuous columns are wrapped in ``ContinuousDomain`` and categorical
	columns in ``CategoricalDomain`` via a ``DataFrameMapper``; the mapped
	data is uploaded through ``H2OFrameCreator`` and ``classifier`` is
	fitted as the final pipeline step.

	Args:
		classifier: Estimator placed as the last step of the pipeline.
		name: Output name passed unchanged to ``store_mojo``,
			``store_pkl`` and ``store_csv``.
	"""
	continuous_columns = ["Age", "Hours", "Income"]
	categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]

	# One ([column], decorator) entry per column, continuous first.
	features = [([col], ContinuousDomain()) for col in continuous_columns]
	features += [([col], CategoricalDomain()) for col in categorical_columns]
	mapper = DataFrameMapper(features)

	steps = [
		("mapper", mapper),
		("uploader", H2OFrameCreator()),
		("classifier", classifier),
	]
	pipeline = PMMLPipeline(steps)

	# The label column is uploaded with an explicit "categorical" type.
	label_frame = H2OFrame(audit_y.to_frame(), column_types = ["categorical"])
	pipeline.fit(audit_X, label_frame)

	# Verify on a fixed 5% sample (random_state = 13) of the training inputs.
	verification_sample = audit_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_sample)

	fitted_classifier = pipeline._final_estimator
	store_mojo(fitted_classifier, name)
	store_pkl(pipeline, name)

	predictions = pipeline.predict(audit_X)
	predictions.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
	store_csv(predictions.as_data_frame(), name)
                          ("Income", ContinuousDomain()),
                          (["Hours", "Income"],
                           Alias(ExpressionTransformer("X[1] / (X[0] * 52)"),
                                 "Hourly_Income"))])
# Final estimator of the pipeline: an H2O random forest with 17 trees.
classifier = H2ORandomForestEstimator(ntrees=17)

# Post-processing applied to the predict_proba output: evaluate "X[1]"
# (presumably the probability of the positive class -- confirm), then bin
# it into the "Decision" labels "no" / "maybe" / "yes" using the cut
# points 0.0, 0.75, 0.90 and 1.0.
predict_proba_transformer = Pipeline([
    ("expression", ExpressionTransformer("X[1]")),
    ("cut",
     Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0],
                          labels=["no", "maybe", "yes"]),
           "Decision",
           prefit=True))
])

# Map locally, upload to H2O, then classify; the step names are kept as-is.
pipeline = PMMLPipeline([("local_mapper", mapper),
                         ("uploader", H2OFrameCreator()),
                         ("remote_classifier", classifier)],
                        predict_proba_transformer=predict_proba_transformer)
pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(),
                               column_types=["categorical"]))

# Verify on 100 sampled rows (no random_state is given, so the sample
# differs between runs).
pipeline.verify(audit_X.sample(100))

# Single source of truth for the exported file, shared by export and deploy.
PMML_PATH = "pmml/RandomForestAudit.pmml"
sklearn2pmml(pipeline, PMML_PATH)

if "--deploy" in sys.argv:
    # Imported lazily so openscoring is only required when deploying.
    from openscoring import Openscoring

    # Deliberately not named `os`: that would shadow the standard-library
    # `os` module name for the rest of the script.
    openscoring_client = Openscoring("http://localhost:8080/openscoring")
    openscoring_client.deployFile("RandomForestAudit", PMML_PATH)
Example #4
0
          Pipeline([('extract', ColumnExtractor(['user_id', 'ip_address'])),
                    ('groupby_count', AddGroupByCount())])),
         ('numerics',
          Pipeline([('extract', ColumnExtractor(NUM_FEATS)),
                    ('zero_fill', ZeroFillTransformer()),
                    ('log', Log1pTransformer())]))
     ]))
])
##############################
# Modeling + Tuning
##############################
from h2o.cross_validation import H2OKFold
# Join the training features and target column-wise (axis=1) into one frame.
dataset = pd.concat([X_train, y_train], axis=1)
# 5-fold splitter over the combined frame, with a fixed seed of 42.
cv = H2OKFold(dataset, n_folds=5, seed=42)
# H2O approach
# NOTE(review): the two tuples below are bare expression statements -- each
# is constructed and then discarded; nothing binds them to a name or passes
# them to a pipeline. They look like (name, step) entries of a step list
# whose enclosing call is not present here -- confirm against the original.
("H2OCreator", H2OFrameCreator()),
# ('standardize', H2OScaler()),
# ('pca', H2OPCA()),
('rf', H2ORandomForestEstimator(ntrees=20))

# something new to try
# from scipy.stats import randint
# params = {
#           # "standardize__center":    [True, False],
#           # "standardize__scale":     [True, False],
#           "pca__k":  2,
#               # randint(2, X_train[1:].shape[1]),
#           "rf__ntrees": 20
# # randint(50,60),
#           # "rf__max_depth":          randint(4,8),
#           # "rf__min_rows":           randint(5,10),