def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	mapper = DataFrameMapper(
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
		[(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
		[(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
		[(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
		[(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
		[(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)
	if isinstance(regressor, DecisionTreeRegressor):
		tree = regressor.tree_
		node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if isinstance(regressor, DecisionTreeRegressor):
		Xt = pipeline_transform(pipeline, auto_na_X)
		mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, mpg_apply), axis = 1)
	store_csv(mpg, name)
def build_auto_na(regressor, name):
	mapper = DataFrameMapper(
		[([column], [CategoricalDomain(missing_values=-1), CategoricalImputer(missing_values=-1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
		[(["origin"], [CategoricalImputer(missing_values=-1), OneHotEncoder()])] +
		[(["acceleration"], [ContinuousDomain(missing_values=None), CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels=False), CategoricalImputer(), LabelBinarizer()])] +
		[(["displacement"], [ContinuousDomain(missing_values=None), SimpleImputer(), CutTransformer(bins=[0, 100, 200, 300, 400, 500], labels=["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
		[(["horsepower"], [ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=50, high_value=225), SimpleImputer()])] +
		[(["weight"], [ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=2000, high_value=5000), SimpleImputer()])]
	)
	pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
	pipeline.fit(auto_na_X, auto_na_y)
	if isinstance(regressor, DecisionTreeRegressor):
		tree = regressor.tree_
		node_impurity = {node_idx: tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
		pipeline.configure(node_extensions={regressor.criterion: node_impurity})
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
	store_csv(mpg, name + ".csv")
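# The two build_auto_na variants above assume roughly the following imports (a sketch; exact module
# paths differ between sklearn2pmml / sklearn-pandas releases). The auto_na_X / auto_na_y frames and
# the store_pkl / store_csv / pipeline_transform helpers are assumed to be defined elsewhere in the script.
import pandas
from pandas import DataFrame
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn_pandas import CategoricalImputer, DataFrameMapper
from sklearn2pmml.decoration import Alias, CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import CutTransformer, ExpressionTransformer, PMMLLabelBinarizer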
def build_audit_cat(classifier, name, with_proba = True, fit_params = {}):
	marital_mapping = {
		"Married-spouse-absent" : "Married"
	}
	mapper = DataFrameMapper(
		[([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
		[(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
		[(["Marital"], [CategoricalDomain(display_name = "Marital"), FilterLookupTransformer(marital_mapping), OrdinalEncoder(dtype = numpy.uint16)])] +
		[(["Occupation"], [CategoricalDomain(display_name = "Occupation"), OrdinalEncoder(dtype = numpy.float_)])] +
		[([column], [CategoricalDomain(display_name = column), LabelEncoder()]) for column in ["Gender", "Deductions"]]
	)
	pipeline = Pipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(), CutTransformer(bins=[0, 20, 40, 60, 80, 100], labels=False, right=False, include_lowest=True)])] +
		[([column], [CategoricalDomain(), LabelEncoder()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]]
	)
	pipeline = Pipeline([("mapper", mapper), ("classifier", classifier)])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
	store_csv(adjusted, name + ".csv")
if "Auto" in datasets:
	build_auto_na(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAutoNA", apply_transformer = Alias(ExpressionTransformer("X[0] - 1"), "eval(nodeId)", prefit = True), winner_id = True)
	build_auto_na(LinearRegression(), "LinearRegressionAutoNA", predict_transformer = CutTransformer(bins = [0, 10, 20, 30, 40], labels = ["0-10", "10-20", "20-30", "30-40"]))

auto_na_X, auto_na_y = load_auto("AutoNA")

auto_na_X["cylinders"] = auto_na_X["cylinders"].astype("Int64")
auto_na_X["model_year"] = auto_na_X["model_year"].astype("Int64")
auto_na_X["origin"] = auto_na_X["origin"].astype("Int64")

def build_auto_na_hist(regressor, name):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["displacement", "horsepower", "weight", "acceleration"]] +
		[([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
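		# assumed continuation: the source snippet breaks off above; the lines below follow the
		# fit/store pattern of build_auto_na earlier in this file.
	])
	pipeline.fit(auto_na_X, auto_na_y)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	store_csv(mpg, name)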
def test_transform(self):
	bins = [float("-inf"), -1.0, 0.0, 1.0, float("+inf")]
	transformer = CutTransformer(bins, labels=False, right=True)
	X = numpy.array([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
	self.assertEqual([[0], [0], [1], [1], [2], [2], [3]], transformer.transform(X).tolist())
	transformer = CutTransformer(bins, labels=False, right=False)
	self.assertEqual([[0], [1], [1], [2], [2], [3], [3]], transformer.transform(X).tolist())
	bins = [-3.0, -1.0, 1.0, 3.0]
	transformer = CutTransformer(bins, labels=False, right=True, include_lowest=True)
	X = numpy.array([-3.0, -2.0, 2.0, 3.0])
	self.assertEqual([[0], [0], [2], [2]], transformer.transform(X).tolist())
	X = numpy.array([-5.0])
	self.assertTrue(numpy.isnan(transformer.transform(X)).tolist()[0])
	X = numpy.array([5.0])
	self.assertTrue(numpy.isnan(transformer.transform(X)).tolist()[0])
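# For reference, CutTransformer is a thin wrapper around pandas.cut, so the bin indices asserted
# above can be reproduced directly; a minimal standalone sketch (assuming only numpy and pandas):
import numpy
import pandas

bins = [float("-inf"), -1.0, 0.0, 1.0, float("+inf")]
X = numpy.array([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
# right=True: intervals are closed on the right, so -1.0 stays in the first bin
print(pandas.cut(X, bins=bins, labels=False, right=True))   # [0 0 1 1 2 2 3]
# right=False: intervals are closed on the left, so -1.0 moves up to the second bin
print(pandas.cut(X, bins=bins, labels=False, right=False))  # [0 1 1 2 2 3 3]
# values outside the outermost bin edges are mapped to NaN, matching the last two assertions
print(pandas.cut(numpy.array([-5.0, 5.0]), bins=[-3.0, -1.0, 1.0, 3.0], labels=False))  # [nan nan]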
if "Auto" in datasets:
	build_auto_na(DecisionTreeRegressor(random_state=13, min_samples_leaf=2), "DecisionTreeAutoNA", apply_transformer=Alias(ExpressionTransformer("X[:, 0] - 1"), "eval(nodeId)", prefit=True), winner_id=True)
	build_auto_na(LinearRegression(), "LinearRegressionAutoNA", predict_transformer=CutTransformer(bins=[0, 10, 20, 30, 40], labels=["0-10", "10-20", "20-30", "30-40"]))

housing_X, housing_y = load_housing("Housing.csv")

def build_housing(regressor, name, with_kneighbors=False, **pmml_options):
	mapper = DataFrameMapper([
		(housing_X.columns.values, ContinuousDomain())
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("transformer-pipeline", Pipeline([
			("polynomial", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
			("scaler", StandardScaler()),
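			# assumed continuation: the source snippet breaks off above; the lines below close the
			# nested transformer pipeline and follow the fit/store pattern of the other builders.
			# The "medv" output column name is an assumption, and with_kneighbors handling is
			# omitted from this sketch.
		])),
		("regressor", regressor)
	])
	pipeline.fit(housing_X, housing_y)
	pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values, housing_y.name)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	medv = DataFrame(pipeline.predict(housing_X), columns=["medv"])
	store_csv(medv, name + ".csv")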
scalar_mapper = DataFrameMapper([
	("Education", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]),
	("Employment", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]),
	("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]),
	("Age", [ContinuousDomain(), CutTransformer(bins=[17, 28, 37, 47, 83], labels=["q1", "q2", "q3", "q4"]), LabelBinarizer()]),
	("Hours", ContinuousDomain()),
	("Income", ContinuousDomain()),
	(["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
interaction_mapper = DataFrameMapper([
	("Gender", [CategoricalDomain(), LabelBinarizer()]),
	("Marital", [CategoricalDomain(), LabelBinarizer()])
])
classifier = XGBClassifier()
pipeline = PMMLPipeline([
	("mapper", FeatureUnion([
		("scalar_mapper", scalar_mapper),
		("interaction",
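		# assumed continuation: the source snippet breaks off above. Based on the surrounding
		# structure, the second FeatureUnion member would pair interaction_mapper with a
		# PolynomialFeatures step before the XGBClassifier; this closing is a hypothetical
		# reconstruction, not the verbatim source.
		Pipeline([
			("interaction_mapper", interaction_mapper),
			("polynomial", PolynomialFeatures())
		]))
	])),
	("classifier", classifier)
])
# audit_X / audit_y are assumed to be loaded as in the H2O snippet below
pipeline.fit(audit_X, audit_y)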
audit_df = pandas.read_csv("csv/Audit.csv")
#print(audit_df.head(5))

audit_X = audit_df[audit_df.columns.difference(["Adjusted"])]
audit_y = audit_df["Adjusted"]

h2o.init()

mapper = DataFrameMapper([
	("Education", CategoricalDomain()),
	("Employment", CategoricalDomain()),
	("Gender", CategoricalDomain()),
	("Marital", CategoricalDomain()),
	("Occupation", CategoricalDomain()),
	("Age", [ContinuousDomain(), CutTransformer(bins=[17, 28, 37, 47, 83], labels=["q1", "q2", "q3", "q4"])]),
	("Hours", ContinuousDomain()),
	("Income", ContinuousDomain()),
	(["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
classifier = H2ORandomForestEstimator(ntrees=17)
predict_proba_transformer = Pipeline([
	("expression", ExpressionTransformer("X[1]")),
	("cut", Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0], labels=["no", "maybe", "yes"]), "Decision", prefit=True))
])
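# How these pieces are assembled is not shown in the snippet above; a minimal sketch, assuming the
# H2OFrameCreator helper from sklearn2pmml.preprocessing.h2o, which uploads the locally mapped data
# frame to the H2O cluster between the mapper and the remote estimator:
from h2o import H2OFrame
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing.h2o import H2OFrameCreator

pipeline = PMMLPipeline([
	("local_mapper", mapper),
	("uploader", H2OFrameCreator()),
	("remote_classifier", classifier)
], predict_proba_transformer=predict_proba_transformer)
# the label column is uploaded as a categorical H2OFrame so that H2O fits a classification model
pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types=["categorical"]))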