def build_audit(classifier, name, **pmml_options):
    """Fit an audit classification pipeline; store its PMML document and predictions.

    Parameters:
        classifier: estimator; its type selects the categorical encoding strategy.
        name: model name, used for domain/encoder lookup and for output file names.
        pmml_options: conversion options forwarded to pipeline.configure().
    """
    if isinstance(classifier, LGBMClassifier):
        # LightGBM consumes label-encoded categoricals directly, so "Age" stays categorical
        cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
        cont_columns = ["Income", "Hours"]
    else:
        cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
        cont_columns = ["Age", "Income", "Hours"]
    if isinstance(classifier, LGBMClassifier):
        cat_mappings = [([cat_column], [cat_domain(name), label_encoder(name)]) for cat_column in cat_columns]
    else:
        cat_mappings = [([cat_column], [cat_domain(name), label_binarizer(name)]) for cat_column in cat_columns]
    cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
    mapper = DataFrameMapper(cat_mappings + cont_mappings)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    if isinstance(classifier, LGBMClassifier):
        # Mark ALL categorical columns. The original hard-coded [0, 1, 2, 3, 4, 5],
        # which silently skipped the seventh categorical column ("Deductions", index 6);
        # deriving the indices from cat_columns keeps this in sync with the mapper.
        pipeline.fit(audit_X, audit_y, classifier__categorical_feature = list(range(len(cat_columns))))
    elif isinstance(classifier, XGBClassifier):
        if name == "XGBoostAuditNA":
            # The NA variant needs a float column so that missing values can be represented
            audit_X["Age"] = audit_X["Age"].astype(float)
        pipeline.fit(audit_X, audit_y)
    else:
        pipeline.fit(audit_X, audit_y)
    if isinstance(classifier, XGBClassifier):
        # Floating-point tolerant verification for XGBoost
        pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_X.sample(n = 3, random_state = 13))
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
    store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
def build_auto(regressor, name, **pmml_options):
    """Fit an auto-mpg regression pipeline with engineered features and store it.

    Parameters:
        regressor: estimator placed at the end of the pipeline.
        name: output file name for the pickled pipeline and prediction CSV.
        pmml_options: conversion options forwarded to pipeline.configure().
    """
    # Composite (cylinders, origin) categories; unseen pairs map to "other"
    cylinders_origin_mapping = {
        (8, 1) : "8/1",
        (6, 1) : "6/1",
        (4, 1) : "4/1",
        (6, 2) : "6/2",
        (4, 2) : "4/2",
        (6, 3) : "6/3",
        (4, 3) : "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
        # Two-digit model years; threshold 77 (ie. 1977) splits earlier vs. later designs.
        # NOTE(review): the original comment referenced the 1973 oil crisis, but the
        # actual cut point is model year 77 — confirm which year was intended.
        (["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
        (["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
        (["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        # Floating-point tolerant verification for XGBoost
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
    store_csv(mpg, name)
def build_auto(regressor, name, **pmml_options):
    """Fit an auto-mpg pipeline and store the PMML document plus predictions.

    Handles three estimator families: LGBMRegressor (label-encoded categoricals),
    IsolationForest (unsupervised outlier scoring), and everything else.
    """
    categorical = ["cylinders", "model_year", "origin"]
    continuous = ["displacement", "horsepower", "weight", "acceleration"]
    # LightGBM works on integer codes; other estimators need binarized indicators
    if isinstance(regressor, LGBMRegressor):
        encoder_factory = label_encoder
    else:
        encoder_factory = label_binarizer
    mappings = [([column], [cat_domain(name), encoder_factory(name)]) for column in categorical]
    mappings += [([column], [cont_domain(name)]) for column in continuous]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(mappings)),
        ("regressor", regressor)
    ])
    if isinstance(regressor, LGBMRegressor):
        pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
    elif isinstance(regressor, IsolationForest):
        # Unsupervised fit: no target column
        pipeline.fit(auto_X)
    else:
        pipeline.fit(auto_X, auto_y)
    if isinstance(regressor, XGBRegressor):
        # Floating-point tolerant verification for XGBoost
        verify_kwargs = {"precision" : 1e-5, "zeroThreshold" : 1e-5}
    else:
        verify_kwargs = {}
    pipeline.verify(auto_X.sample(n = 3, random_state = 13), **verify_kwargs)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    if isinstance(regressor, IsolationForest):
        decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
        outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
        # Predictions are +1/-1; map -1 (outlier) to the string "true", +1 to "false"
        outlier["outlier"] = outlier["outlier"].apply(lambda value: str(bool(value == -1)).lower())
        store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
    else:
        store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
    """Fit an auto-mpg regression pipeline with engineered date/ratio features.

    Parameters:
        regressor: estimator placed at the end of the pipeline.
        name: output file name for the pickled pipeline and prediction CSV.
        fit_params: optional extra keyword arguments for pipeline.fit().
        predict_params: optional extra keyword arguments for predict/verify.
        pmml_options: conversion options forwarded to pipeline.configure().

    The two *_params arguments default to None instead of {} to avoid the
    shared-mutable-default-argument pitfall; None means "no extra params".
    """
    fit_params = fit_params or {}
    predict_params = predict_params or {}
    # Composite (cylinders, origin) categories; unseen pairs map to "other"
    cylinders_origin_mapping = {
        (8, 1) : "8/1",
        (6, 1) : "6/1",
        (4, 1) : "4/1",
        (6, 2) : "6/2",
        (4, 2) : "4/2",
        (4, 3) : "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
        (["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
        # Expand the two-digit model year to a date, count days since 1977-01-01,
        # and binarize on the sign (before/after 1977)
        (["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
        (["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
        (["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
        (["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        # Floating-point tolerant verification for XGBoost
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
    store_csv(mpg, name)
def main(input_data, output_data, model_dest):
    """Train a random forest regressor and persist it as both joblib and PMML.

    Parameters:
        input_data: CSV path with the feature matrix.
        output_data: CSV path with the target values.
        model_dest: destination path prefix; ".joblib" / ".pmml" are appended.
    """
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.4, random_state=23)
    model = RandomForestRegressor(verbose=True, n_jobs=-1)
    logger.info("Fitting model")
    # The original called model.fit(...) twice back-to-back; one fit is sufficient
    model.fit(X_train, y_train)
    logger.info("Saving joblib model")
    dump(model, model_dest + ".joblib")
    # NOTE(review): the step is named "classifier" although it wraps a regressor;
    # kept as-is in case the step name is referenced downstream — confirm and rename.
    pipeline = PMMLPipeline([("classifier", RandomForestRegressor(verbose=True, n_jobs=-1))])
    pipeline.fit(X_train, y_train)
    pipeline.verify(X_test.sample(n=10))
    logger.info("Saving PMML model")
    sklearn2pmml(pipeline, model_dest + ".pmml")
def build_auto_h2o(regressor, name):
    """Fit an H2O auto-mpg regression pipeline; store its MOJO, pickle and predictions."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    transformers = [(column, CategoricalDomain(), [column]) for column in cat_columns]
    transformers += [(column, ContinuousDomain(), [column]) for column in cont_columns]
    # Column names/types must line up with the transformer output order
    uploader = H2OFrameCreator(
        column_names = cat_columns + cont_columns,
        column_types = (["enum"] * len(cat_columns)) + (["numeric"] * len(cont_columns))
    )
    pipeline = PMMLPipeline([
        ("transformer", ColumnTransformer(transformers)),
        ("uploader", uploader),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    fitted_regressor = pipeline._final_estimator
    store_mojo(fitted_regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
def build_iris(classifier, name, **pmml_options):
    """Fit an iris classification pipeline; store the PMML document and predictions."""
    feature_columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
    mapper = DataFrameMapper([([column], ContinuousDomain()) for column in feature_columns])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    verification_sample = iris_X.sample(n=3, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # Floating-point tolerant verification for XGBoost
        pipeline.verify(verification_sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(verification_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    proba_columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=proba_columns)
    store_csv(pandas.concat((species, species_proba), axis=1), name)
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
    """Fit an auto-mpg regression pipeline on data with missing values.

    Parameters:
        regressor: estimator placed at the end of the pipeline.
        name: output file name for the pickled pipeline and prediction CSV.
        predict_transformer / apply_transformer: optional PMMLPipeline post-processors.
        pmml_options: conversion options forwarded to pipeline.configure().
    """
    mapper = DataFrameMapper(
        # -1 encodes a missing categorical level; impute, then binarize
        [([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
        # Discretize continuous columns into bins before imputation/binarization
        [(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
        [(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
        # Clamp out-of-range values to the extremes, then impute the median
        [(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
        [(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        tree = regressor.tree_
        # Export per-node impurities (non-zero only) as PMML node extensions
        node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
        pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
    pipeline.configure(**pmml_options)
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    if isinstance(regressor, DecisionTreeRegressor):
        # Append the leaf node id reached by each sample
        Xt = pipeline_transform(pipeline, auto_na_X)
        mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis = 1)
    store_csv(mpg, name)
def build_auto_isotonic(regressor, auto_isotonic_X, name):
    """Fit a single-step isotonic regression pipeline and store it with predictions."""
    pipeline = PMMLPipeline([("regressor", regressor)])
    pipeline.fit(auto_isotonic_X, auto_y)
    pipeline.verify(auto_isotonic_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    predictions = pipeline.predict(auto_isotonic_X)
    store_csv(DataFrame(predictions, columns = ["mpg"]), name)
def build_auto_opt(regressor, name, fit_params = None, **pmml_options):
    """Fit a pipeline on the auto-mpg training subset only; store it with predictions.

    fit_params defaults to None instead of {} to avoid the shared-mutable-default
    pitfall; None is treated as "no extra fit params".
    """
    fit_params = fit_params or {}
    pipeline = PMMLPipeline([
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params)
    # The original accepted **pmml_options but never applied them; forward them to
    # the converter for consistency with the other build_* helpers in this file
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        # Floating-point tolerant verification for XGBoost
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
    store_csv(mpg, name)
def build_iris_opt(classifier, name, fit_params = None, **pmml_options):
    """Fit a classifier on the iris training subset only; store it with predictions.

    fit_params defaults to None instead of {} to avoid the shared-mutable-default
    pitfall; None is treated as "no extra fit params".
    """
    fit_params = fit_params or {}
    pipeline = PMMLPipeline([
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params)
    # The original accepted **pmml_options but never applied them; forward them to
    # the converter for consistency with the other build_* helpers in this file
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        # Floating-point tolerant verification for XGBoost
        pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
    species = pandas.concat((species, species_proba), axis = 1)
    store_csv(species, name)
def build_auto_na_hist(regressor, name):
    """Fit a regression pipeline on the auto-mpg data with missing values."""
    cont_mappings = [([column], ContinuousDomain()) for column in ["displacement", "horsepower", "weight", "acceleration"]]
    cat_mappings = [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cont_mappings + cat_mappings)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    store_csv(DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"]), name)
def build_visit(regressor, name):
    """Fit a doctor-visits regression pipeline and store it with predictions."""
    mappings = [(["edlevel"], [CategoricalDomain(), OneHotEncoder()])]
    # Binary indicator columns get the same categorical treatment as edlevel
    for column in ["outwork", "female", "married", "kids", "self"]:
        mappings.append(([column], [CategoricalDomain(), OneHotEncoder()]))
    mappings.append((["age"], ContinuousDomain()))
    mappings.append((["hhninc", "educ"], ContinuousDomain()))
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(mappings)),
        ("regressor", regressor)
    ])
    pipeline.fit(visit_X, visit_y)
    pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
    store_csv(docvis, name)
def build_audit_na_direct(classifier, name):
    """Fit an audit classification pipeline that passes numeric columns through unchanged."""
    numeric_columns = ["Age", "Hours", "Income"]
    categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
    mapper = DataFrameMapper([
        (numeric_columns, None),
        (categorical_columns, OneHotEncoder())
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
    store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
    """Fit a binary versicolor classifier on the two petal measurements only.

    Parameters:
        classifier: estimator placed at the end of the pipeline.
        name: output file name for the pickled pipeline and prediction CSV.
        with_proba: when truthy, also store class probabilities.
        pmml_options: conversion options forwarded to pipeline.configure().
    """
    transformer = ColumnTransformer(
        [("all", "passthrough", ["Petal.Length", "Petal.Width"])],
        remainder="drop")
    pipeline = PMMLPipeline([("transformer", transformer), ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    # Idiomatic truthiness test (the original compared "== True")
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X), columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name)
def build_audit_na_hist(classifier, name):
    """Fit an audit classification pipeline on data with missing values."""
    cont_mappings = [([column], ContinuousDomain()) for column in ["Age", "Hours", "Income"]]
    cat_mappings = [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cont_mappings + cat_mappings)),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
    store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
def main(input_data, output_data, model_dest):
    """Train a price-range classifier; save joblib and PMML artifacts."""
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.4, random_state=23)
    # Flatten the target column once; both model fits use the same values
    target = y_train["price_range"].ravel()
    logger.info("Fitting model")
    model = RandomForestClassifier(verbose=True, n_jobs=-1).fit(X_train, target)
    logger.info("Saving joblib model")
    dump(model, model_dest + ".joblib")
    pipeline = PMMLPipeline([("classifier", RandomForestClassifier(verbose=True, n_jobs=-1))])
    pipeline.fit(X_train, target)
    pipeline.verify(X_test.sample(n=10))
    logger.info("Saving PMML model")
    feature_names = [
        "battery_power", "clock_speed", "fc", "int_memory", "m_dep",
        "mobile_wt", "n_cores", "pc", "px_height", "px_width",
        "ram", "sc_h", "sc_w", "talk_time",
    ]
    skl_to_pmml(pipeline, feature_names, "price_range", model_dest + ".pmml")
def build_audit_h2o(classifier, name):
    """Fit an H2O audit classification pipeline; store its MOJO, pickle and predictions."""
    cont_mappings = [([column], ContinuousDomain()) for column in ["Age", "Hours", "Income"]]
    cat_mappings = [([column], CategoricalDomain()) for column in ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cont_mappings + cat_mappings)),
        ("uploader", H2OFrameCreator()),
        ("classifier", classifier)
    ])
    # The target is uploaded as a categorical H2O frame column
    pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types = ["categorical"]))
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
    fitted_classifier = pipeline._final_estimator
    store_mojo(fitted_classifier, name)
    store_pkl(pipeline, name)
    adjusted = pipeline.predict(audit_X)
    adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
    store_csv(adjusted.as_data_frame(), name)
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
    """Fit an audit classification pipeline on data with missing values.

    Parameters:
        classifier: estimator placed at the end of the pipeline.
        name: output file name for the pickled pipeline and prediction CSV.
        with_proba: when truthy, also store class probabilities.
        fit_params / predict_params / predict_proba_params: extra keyword arguments
            forwarded to the corresponding pipeline calls.
        predict_transformer / predict_proba_transformer / apply_transformer:
            optional PMMLPipeline post-processors.
        pmml_options: conversion options forwarded to pipeline.configure().

    NOTE(review): the dict-typed defaults are mutable default arguments; they are
    only read here (never mutated), but None sentinels would be the safer idiom.
    """
    # Collapse employment levels to PRIVATE/PUBLIC; unmapped levels become OTHER
    employment_mapping = {
        "CONSULTANT" : "PRIVATE",
        "PSFEDERAL" : "PUBLIC",
        "PSLOCAL" : "PUBLIC",
        "PSSTATE" : "PUBLIC",
        "SELFEMP" : "PRIVATE",
        "PRIVATE" : "PRIVATE"
    }
    # Numeric gender encoding; a missing value maps halfway between the two levels
    gender_mapping = {
        "FEMALE" : 0.0,
        "MALE" : 1.0,
        "MISSING_VALUE" : 0.5
    }
    mapper = DataFrameMapper(
        # Flag missing ages with the sentinel -999, then impute the constant 38
        [(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
        [(["Age"], MissingIndicator())] +
        [(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
        # Treat out-of-range incomes as missing values, then impute the median
        [(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
        [(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y, **fit_params)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        # Floating-point tolerant verification for XGBoost
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Append the leaf node id reached by each sample
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
    store_csv(adjusted, name)
def test_fit_verify(self):
    """Fitting must capture active/target field names; verify() must record
    verification data and reject data with mismatched column order."""
    pipeline = PMMLPipeline([("estimator", DummyRegressor())])
    # Field metadata does not exist before the first fit
    self.assertFalse(hasattr(pipeline, "active_fields"))
    self.assertFalse(hasattr(pipeline, "target_fields"))
    X = DataFrame([[1, 0], [2, 0], [3, 0]], columns=["X1", "X2"])
    y = Series([0.5, 1.0, 1.5], name="y")
    pipeline.fit(X, y)
    self.assertEqual(["X1", "X2"], pipeline.active_fields.tolist())
    self.assertEqual("y", pipeline.target_fields.tolist())
    # Re-fitting refreshes the captured field names
    X.columns = ["x1", "x2"]
    pipeline.fit(X, y)
    self.assertEqual(["x1", "x2"], pipeline.active_fields.tolist())
    self.assertEqual("y", pipeline.target_fields.tolist())
    # verify() records the sampled active/target values
    self.assertFalse(hasattr(pipeline, "verification"))
    pipeline.verify(X.sample(2))
    self.assertEqual(2, len(pipeline.verification.active_values))
    self.assertEqual(2, len(pipeline.verification.target_values))
    # Column names that do not match the fitted active fields must be rejected
    X.columns = ["x2", "x1"]
    with self.assertRaises(ValueError):
        pipeline.verify(X.sample(2))
def main(input_data, output_data, model_dest):
    """Train a tuned random forest approval classifier; save joblib and PMML artifacts.

    Parameters:
        input_data: CSV path with the feature matrix.
        output_data: CSV path with the target values.
        model_dest: destination path prefix; ".joblib" / ".pmml" are appended.
    """
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.4, random_state=23)
    logger.info("Create model (tuned hyperparams)")
    model = RandomForestClassifier(
        max_depth=8,
        max_leaf_nodes=64,
        max_samples=0.5,
        n_estimators=10,
        verbose=True,
        n_jobs=-1,
    )
    logger.info("Fitting model")
    model = model.fit(X_train, y_train)
    logger.info("Saving joblib model")
    dump(model, model_dest + ".joblib")
    # NOTE(review): the PMML pipeline re-trains a classifier with DEFAULT
    # hyperparameters rather than reusing the tuned ones above — confirm intent.
    pipeline = PMMLPipeline([("classifier", RandomForestClassifier(verbose=True, n_jobs=-1))])
    pipeline.fit(X_train, y_train)
    pipeline.verify(X_test.sample(n=10))
    logger.info("Saving PMML model")
    skl_to_pmml(
        pipeline,
        ["Age", "Debt", "YearsEmployed", "Income"],
        "Approved",
        model_dest + ".pmml",
    )
LabelBinarizer() ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income")) ]) interaction_mapper = DataFrameMapper([ ("Gender", [CategoricalDomain(), LabelBinarizer()]), ("Marital", [CategoricalDomain(), LabelBinarizer()]) ]) classifier = XGBClassifier() pipeline = PMMLPipeline([ ("mapper", FeatureUnion([("scalar_mapper", scalar_mapper), ("interaction", Pipeline([("interaction_mapper", interaction_mapper), ("polynomial", PolynomialFeatures())]))])), ("classifier", classifier) ]) pipeline.fit(audit_X, audit_y) pipeline.configure(compact=True) pipeline.verify(audit_X.sample(100), zeroThreshold=1e-6, precision=1e-6) sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml") if "--deploy" in sys.argv: from openscoring import Openscoring os = Openscoring("http://localhost:8080/openscoring") os.deployFile("XGBoostAudit", "pmml/XGBoostAudit.pmml")
sublinear_tf=True, max_df=0.1, min_df=0.0001, norm=None, tokenizer=Splitter())), ( 'linear', SGDClassifier( # params )), ]) # Train the model pipeline.fit(train_data, train_labels) # Pack verification data and verify pipeline.verify(test_data) # Save the pipeline + model sklearn2pmml.sklearn2pmml(sklearn2pmml.make_pmml_pipeline( pipeline, active_fields=FEATURE_COLS, target_fields=TARGET_COL), 'model.pmml', with_repr=True, debug=True) # Measure the accuracy descision = pipeline.decision_function(test_data.sample(1)) jpmml_input_data = pd.read_csv("jpmml-test-input.csv", usecols=FEATURE_COLS) input_pred = pipeline.predict(jpmml_input_data.squeeze()) with numpy.printoptions(threshold=numpy.inf): print(input_pred)
def __init__(self): self.n_splits = 1 def split(self, X, y, groups=None): yield (numpy.arange(len(X)), numpy.arange(len(y))) def get_n_splits(self, X, y, groups=None): return self.n_splits pipeline = PMMLPipeline([("mapper", mapper), ("ensemble", StackingClassifier([("lightgbm", lightgbm_pipeline), ("xgboost", xgboost_pipeline), ("sklearn", sklearn_pipeline)], final_estimator=final_estimator, cv=DisabledCV(), passthrough=False))]) # pipeline = PMMLPipeline([ # ("mapper", mapper), # ("ensemble", StackingClassifier([ # ("lightgbm", lightgbm_pipeline), ("sklearn", sklearn_pipeline) # ], final_estimator=final_estimator, cv=DisabledCV(), passthrough=False)) # ]) pipeline.fit(df_X, df_y) pipeline.verify(df_X.sample(n=10, random_state=13)) sklearn2pmml(pipeline, "../pmml/StackingEnsembleAudit.pmml")
("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))]) classifier = H2ORandomForestEstimator(ntrees=17) predict_proba_transformer = Pipeline([ ("expression", ExpressionTransformer("X[1]")), ("cut", Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0], labels=["no", "maybe", "yes"]), "Decision", prefit=True)) ]) pipeline = PMMLPipeline([("local_mapper", mapper), ("uploader", H2OFrameCreator()), ("remote_classifier", classifier)], predict_proba_transformer=predict_proba_transformer) pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types=["categorical"])) pipeline.verify(audit_X.sample(100)) sklearn2pmml(pipeline, "pmml/RandomForestAudit.pmml") if "--deploy" in sys.argv: from openscoring import Openscoring os = Openscoring("http://localhost:8080/openscoring") os.deployFile("RandomForestAudit", "pmml/RandomForestAudit.pmml")
store_pkl(pipeline, "SelectFirstIris") species = DataFrame(pipeline.predict(iris_X), columns = ["Species"]) species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]) species = pandas.concat((species, species_proba), axis = 1) store_csv(species, "SelectFirstIris") if "Iris" in datasets: classifier = RuleSetClassifier([ ("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"), ("X['Petal.Length'] >= 2.45", "virginica") ], default_score = "setosa") pipeline = PMMLPipeline([ ("classifier", classifier) ]) pipeline.fit(iris_X, iris_y) pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13)) store_pkl(pipeline, "RuleSetIris") species = DataFrame(pipeline.predict(iris_X), columns = ["Species"]) store_csv(species, "RuleSetIris") # # Text classification # sentiment_X, sentiment_y = load_sentiment("Sentiment") def build_sentiment(classifier, name, with_proba = True, **pmml_options): pipeline = PMMLPipeline([ ("tf-idf", TfidfVectorizer(analyzer = "word", preprocessor = None, strip_accents = None, lowercase = True, token_pattern = None, tokenizer = Splitter(), stop_words = "english", ngram_range = (1, 2), norm = None, dtype = (numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64))), ("selector", SelectKBest(f_classif, k = 500)), ("classifier", classifier)