def build_auto_h2o(regressor, name):
    """Fit, verify and export a regression pipeline on the Auto dataset.

    The pipeline declares one domain per feature, converts the result with
    ``H2OFrameCreator`` and ends with ``regressor``. After fitting on the
    module-level ``auto_X`` / ``auto_y`` data, the final estimator, the
    pipeline and the predictions are handed to ``store_mojo``, ``store_pkl``
    and ``store_csv`` respectively.

    Args:
        regressor: Estimator used as the last pipeline step.
        name: Base name for the outputs; ``.zip``, ``.pkl`` and ``.csv`` are
            appended for the three stored artifacts.
    """
    categorical_columns = ["cylinders", "model_year", "origin"]
    continuous_columns = ["displacement", "horsepower", "weight", "acceleration"]

    # One single-column entry per feature, categorical features first.
    domain_steps = [
        (feature, CategoricalDomain(), [feature])
        for feature in categorical_columns
    ]
    domain_steps += [
        (feature, ContinuousDomain(), [feature])
        for feature in continuous_columns
    ]
    transformer = ColumnTransformer(domain_steps)

    # Column names and types follow the same order as the domain steps above.
    uploader = H2OFrameCreator(
        column_names=categorical_columns + continuous_columns,
        column_types=(
            ["enum"] * len(categorical_columns)
            + ["numeric"] * len(continuous_columns)
        ),
    )

    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", uploader),
        ("regressor", regressor),
    ])

    target_frame = H2OFrame(auto_y.to_frame())
    pipeline.fit(auto_X, target_frame)

    # Verify against a fixed 5% sample so the run is reproducible.
    verification_sample = auto_X.sample(frac=0.05, random_state=13)
    pipeline.verify(verification_sample)

    fitted_regressor = pipeline._final_estimator
    store_mojo(fitted_regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")

    predictions = pipeline.predict(auto_X)
    predictions.set_names(["mpg"])
    store_csv(predictions.as_data_frame(), name + ".csv")
def build_audit_h2o(classifier, name):
    """Fit, verify and export a classification pipeline on the Audit dataset.

    The pipeline maps each feature to a domain with ``DataFrameMapper``,
    converts the result with ``H2OFrameCreator`` and ends with
    ``classifier``. After fitting on the module-level ``audit_X`` /
    ``audit_y`` data, the final estimator, the pipeline and the predictions
    are handed to ``store_mojo``, ``store_pkl`` and ``store_csv``.

    Args:
        classifier: Estimator used as the last pipeline step.
        name: Name passed unchanged to each of the ``store_*`` helpers.
    """
    continuous_columns = ["Age", "Hours", "Income"]
    categorical_columns = [
        "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions",
    ]

    # One single-column entry per feature, continuous features first.
    domain_steps = [
        ([feature], ContinuousDomain()) for feature in continuous_columns
    ]
    domain_steps += [
        ([feature], CategoricalDomain()) for feature in categorical_columns
    ]
    mapper = DataFrameMapper(domain_steps)

    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("uploader", H2OFrameCreator()),
        ("classifier", classifier),
    ])

    # The target column is uploaded with the "categorical" column type.
    target_frame = H2OFrame(audit_y.to_frame(), column_types=["categorical"])
    pipeline.fit(audit_X, target_frame)

    # Verify against a fixed 5% sample so the run is reproducible.
    verification_sample = audit_X.sample(frac=0.05, random_state=13)
    pipeline.verify(verification_sample)

    fitted_classifier = pipeline._final_estimator
    store_mojo(fitted_classifier, name)
    store_pkl(pipeline, name)

    predictions = pipeline.predict(audit_X)
    predictions.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
    store_csv(predictions.as_data_frame(), name)
("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))]) classifier = H2ORandomForestEstimator(ntrees=17) predict_proba_transformer = Pipeline([ ("expression", ExpressionTransformer("X[1]")), ("cut", Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0], labels=["no", "maybe", "yes"]), "Decision", prefit=True)) ]) pipeline = PMMLPipeline([("local_mapper", mapper), ("uploader", H2OFrameCreator()), ("remote_classifier", classifier)], predict_proba_transformer=predict_proba_transformer) pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types=["categorical"])) pipeline.verify(audit_X.sample(100)) sklearn2pmml(pipeline, "pmml/RandomForestAudit.pmml") if "--deploy" in sys.argv: from openscoring import Openscoring os = Openscoring("http://localhost:8080/openscoring") os.deployFile("RandomForestAudit", "pmml/RandomForestAudit.pmml")
Pipeline([('extract', ColumnExtractor(['user_id', 'ip_address'])), ('groupby_count', AddGroupByCount())])), ('numerics', Pipeline([('extract', ColumnExtractor(NUM_FEATS)), ('zero_fill', ZeroFillTransformer()), ('log', Log1pTransformer())])) ])) ]) ############################## # Modeling + Tuning ############################## from h2o.cross_validation import H2OKFold dataset = pd.concat([X_train, y_train], axis=1) cv = H2OKFold(dataset, n_folds=5, seed=42) # H2O approach ("H2OCreator", H2OFrameCreator()), # ('standardize', H2OScaler()), # ('pca', H2OPCA()), ('rf', H2ORandomForestEstimator(ntrees=20)) # something new to try # from scipy.stats import randint # params = { # # "standardize__center": [True, False], # # "standardize__scale": [True, False], # "pca__k": 2, # # randint(2, X_train[1:].shape[1]), # "rf__ntrees": 20 # # randint(50,60), # # "rf__max_depth": randint(4,8), # # "rf__min_rows": randint(5,10),