def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    """Fit a classifier on the "Audit" dataset with label-encoded categoricals, then store the pickled pipeline and its predictions."""
    # Continuous columns pass through a ContinuousDomain decorator unchanged.
    features = [([column], ContinuousDomain()) for column in ["Age", "Income"]]
    # "Hours" is additionally discretized into five fixed-width bins.
    features.append((["Hours"], [
        ContinuousDomain(),
        CutTransformer(bins=[0, 20, 40, 60, 80, 100], labels=False, right=False, include_lowest=True)
    ]))
    # Categorical columns are label-encoded into integer codes.
    for column in ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]:
        features.append(([column], [CategoricalDomain(), LabelEncoder()]))
    mapper = DataFrameMapper(features)
    pipeline = Pipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y, **fit_params)
    # Re-wrap the fitted pipeline so that PMML column metadata is attached.
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, proba), axis=1)
    store_csv(adjusted, name + ".csv")
def test_fit_float_outlier(self):
    """Check outlier masking and both outlier treatments of ContinuousDomain.

    Covers "as_missing_values" (outliers become missing, then replaced) and
    "as_extreme_values" (outliers clipped to [low_value, high_value]).
    """
    domain = clone(ContinuousDomain(outlier_treatment="as_missing_values", low_value=0.0, high_value=3.0, missing_values=float("NaN"), missing_value_replacement=1.0))
    self.assertEqual(0.0, domain.low_value)
    self.assertEqual(3.0, domain.high_value)
    X = DataFrame([[-2.0, float("NaN")], [2.0, 4.0], [float("NaN"), 0.0]])
    self.assertEqual([[False, True], [False, False], [True, False]], domain._missing_value_mask(X).values.tolist())
    # -2.0 is below low_value; 4.0 is above high_value
    self.assertEqual([[True, False], [False, True], [False, False]], domain._outlier_mask(X).values.tolist())
    Xt = domain.fit_transform(X)
    # Outliers were converted to missing and, like NaNs, replaced by 1.0
    self.assertEqual([1.0, 2.0, 1.0], Xt[0].tolist())
    self.assertEqual([1.0, 1.0, 0.0], Xt[1].tolist())
    domain = clone(ContinuousDomain(outlier_treatment="as_extreme_values", low_value=0.0, high_value=3.0, missing_values=-1.0))
    X = DataFrame([[-2.0, -1.0], [2.0, 4.0], [-1.0, 0.0]])
    self.assertEqual([[False, True], [False, False], [True, False]], domain._missing_value_mask(X).values.tolist())
    self.assertEqual([[True, False], [False, True], [False, False]], domain._outlier_mask(X).values.tolist())
    self.assertEqual([[True, False], [False, False], [False, False]], domain._negative_outlier_mask(X).values.tolist())
    self.assertEqual([[False, False], [False, True], [False, False]], domain._positive_outlier_mask(X).values.tolist())
    Xt = domain.fit_transform(X)
    # FIX: assert against the transformer's return value (Xt), not the input X.
    # The original asserted on X, leaving Xt unused; that only passed via
    # in-place mutation and would pass vacuously if fit_transform ever
    # switched to copying semantics.
    self.assertEqual([0.0, 2.0, -1.0], Xt[0].tolist())
    self.assertEqual([-1.0, 3.0, 0.0], Xt[1].tolist())
def build_audit(classifier, name, with_proba = True, **kwargs):
    # Fits a classifier on the "Audit" dataset using separate continuous and
    # categorical mappers joined by a FeatureUnion, then stores the pickled
    # pipeline and its predictions.
    continuous_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Income", ContinuousDomain()),
        ("Hours", ContinuousDomain())
    ])
    categorical_mapper = DataFrameMapper([
        # Binarize, then keep only features selected by an embedded tree model
        ("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
        ("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
        ("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
        ("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
        ("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                # Interaction terms over the selected categorical features
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    # Post-fit estimator customization hook
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    if(with_proba == True):
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name + ".csv")
def build_audit(classifier, name, with_proba=True):
    # Fits a classifier on the "Audit" dataset with per-column feature
    # selection, then stores the pickled pipeline and its predictions.
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        # Binarize, then keep features whose tree-based importance clears the threshold
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                    threshold="median"))
        ]),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        # NOTE(review): underscore-style column names ("probability_0") differ
        # from the parenthesized style ("probability(0)") used by the other
        # builders in this file — confirm the divergence is intended.
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
    """Fit a regressor on the "Auto MPG" dataset with heavy feature engineering, then verify and store it.

    fit_params/predict_params default to None rather than to shared mutable
    {} literals (the classic mutable-default-argument pitfall); pass a dict
    to forward extra keyword arguments to fit/predict.
    """
    if fit_params is None:
        fit_params = {}
    if predict_params is None:
        predict_params = {}
    # Maps (cylinders, origin) pairs to combined category labels
    cylinders_origin_mapping = {
        (8, 1) : "8/1",
        (6, 1) : "6/1",
        (4, 1) : "4/1",
        (6, 2) : "6/2",
        (4, 2) : "4/2",
        (4, 3) : "4/3"
    }
    mapper = DataFrameMapper([
        # Derived boolean feature: does the car have an odd number of cylinders?
        (["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
        (["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
        # Convert the two-digit model year to a date and binarize around 1977
        (["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
        (["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
        (["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
        (["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        # XGBoost predictions are float32-based; relax the verification tolerances
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
    store_csv(mpg, name)
def test_fit_float_missing(self):
    # Verifies that a cloned ContinuousDomain replaces the designated missing
    # value (-1.0) and collects range, frequency and dispersion statistics.
    domain = ContinuousDomain(missing_values=-1.0, missing_value_replacement=4.0)
    # Cloning must preserve constructor parameters
    domain = clone(domain)
    self.assertEqual(-1.0, domain.missing_values)
    self.assertEqual(4.0, domain.missing_value_replacement)
    self.assertFalse(domain._empty_fit())
    X = DataFrame([1.0, -1.0, 3.0, 2.0, -1.0, 2.0])
    Xt = domain.fit_transform(X)
    self.assertIsInstance(Xt, DataFrame)
    # Statistics are computed over valid values only (-1.0 entries are missing)
    self.assertEqual(1.0, domain.data_min_)
    self.assertEqual(3.0, domain.data_max_)
    self.assertEqual({
        "totalFreq": 6,
        "missingFreq": 2,
        "invalidFreq": 0
    }, domain.counts_)
    self.assertEqual(
        {
            "minimum": [1.0],
            "maximum": [3.0],
            "mean": [2.0],
            "standardDeviation": [0.7071067811865476],
            "median": [2.0],
            "interQuartileRange": [0.5]
        }, _array_to_list(domain.numeric_info_))
    # Missing entries are replaced by 4.0 in the transformed output
    self.assertEqual([1.0, 4.0, 3.0, 2.0, 4.0, 2.0], Xt[0].tolist())
    X = numpy.array([-1.0, -1.0])
    Xt = domain.transform(X)
    self.assertEqual([4.0, 4.0], Xt.tolist())
def build_audit_cat(classifier, name, with_proba = True, fit_params = None):
    """Fit a classifier on the "Audit" dataset using ordinal/label-encoded categoricals, then verify and store it.

    fit_params defaults to None rather than to a shared mutable {} literal
    (the classic mutable-default-argument pitfall); pass a dict to forward
    extra keyword arguments to fit.
    """
    if fit_params is None:
        fit_params = {}
    # Collapse the rare "Married-spouse-absent" level into "Married"
    marital_mapping = {
        "Married-spouse-absent" : "Married"
    }
    mapper = DataFrameMapper(
        [([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
        # Discretize "Hours" into five fixed-width bins
        [(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
        [(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
        [(["Marital"], [CategoricalDomain(display_name = "Marital"), FilterLookupTransformer(marital_mapping), OrdinalEncoder(dtype = numpy.uint16)])] +
        [(["Occupation"], [CategoricalDomain(display_name = "Occupation"), OrdinalEncoder(dtype = numpy.float_)])] +
        [([column], [CategoricalDomain(display_name = column), LabelEncoder()]) for column in ["Gender", "Deductions"]]
    )
    pipeline = Pipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)
    # Re-wrap the fitted pipeline so that PMML column metadata is attached
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
    # Fits a regressor on the "Auto MPG (N/A)" dataset, exercising a variety
    # of missing-value and outlier treatments, then verifies and stores the
    # pipeline together with its predictions.
    mapper = DataFrameMapper(
        # Categorical columns use -1 as the missing-value marker
        [([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
        # Continuous columns use None (NaN) as the missing-value marker
        [(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
        [(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
        # Out-of-range values are clipped to [low_value, high_value] before imputation
        [(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
        [(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        tree = regressor.tree_
        # Record the impurity of every impure node as a PMML node extension
        node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
        pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
    pipeline.configure(**pmml_options)
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    if isinstance(regressor, DecisionTreeRegressor):
        Xt = pipeline_transform(pipeline, auto_na_X)
        # Also store the identifier of the leaf node that each row lands in
        mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis = 1)
    store_csv(mpg, name)
def test_fit_float(self):
    # Verifies empty-fit detection, constructor parameter handling, and
    # missing-value replacement for NaN-marked ContinuousDomain instances.
    domain = clone(ContinuousDomain(with_data = False, with_statistics = False))
    # Neither data range nor statistics requested, so fitting is a no-op
    self.assertTrue(domain._empty_fit())
    domain = clone(ContinuousDomain(missing_values = float("NaN"), missing_value_treatment = "as_value", missing_value_replacement = -1.0, invalid_value_treatment = "as_is", invalid_value_replacement = 0.0))
    self.assertTrue(numpy.isnan(domain.missing_values))
    self.assertEqual("as_value", domain.missing_value_treatment)
    self.assertEqual(-1.0, domain.missing_value_replacement)
    self.assertEqual("as_is", domain.invalid_value_treatment)
    self.assertEqual(0.0, domain.invalid_value_replacement)
    # Fitted attributes must not exist before fitting
    self.assertFalse(hasattr(domain, "data_min_"))
    self.assertFalse(hasattr(domain, "data_max_"))
    self.assertFalse(hasattr(domain, "counts_"))
    self.assertFalse(hasattr(domain, "numeric_info_"))
    self.assertFalse(domain._empty_fit())
    X = DataFrame(numpy.array([1.0, float("NaN"), 3.0, 2.0, float("NaN"), 2.0]))
    Xt = domain.fit_transform(X)
    self.assertIsInstance(Xt, DataFrame)
    # Statistics are computed over the four non-NaN values only
    self.assertEqual(1.0, domain.data_min_)
    self.assertEqual(3.0, domain.data_max_)
    self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
    self.assertEqual({"minimum" : [1.0], "maximum" : [3.0], "mean" : [2.0], "standardDeviation" : [0.7071067811865476], "median" : [2.0], "interQuartileRange" : [0.5]}, _array_to_list(domain.numeric_info_))
    # NaN entries are replaced by -1.0 in the transformed output
    self.assertEqual([1.0, -1.0, 3.0, 2.0, -1.0, 2.0], Xt[0].tolist())
    X = numpy.array([float("NaN"), None])
    Xt = domain.transform(X)
    self.assertEqual([-1.0, -1.0], Xt.tolist())
def test_fit_float(self):
    """A freshly constructed ContinuousDomain has no data range until it is fitted."""
    domain = ContinuousDomain()
    # Default treatment for invalid values
    self.assertEqual("return_invalid", domain.invalid_value_treatment)
    for fitted_attr in ["data_min_", "data_max_"]:
        self.assertFalse(hasattr(domain, fitted_attr))
    X = numpy.array([1.0, float('NaN'), 3.0, 2.0, float('NaN'), 2.0])
    domain = domain.fit(X)
    # NaN values are ignored when computing the observed value range
    self.assertEqual(1.0, domain.data_min_)
    self.assertEqual(3.0, domain.data_max_)
def build_auto_na(regressor, name):
    # Fits a regressor on the "Auto MPG (N/A)" dataset, exercising categorical
    # and continuous missing-value treatments, then stores the pickled
    # pipeline and its predictions.
    mapper = DataFrameMapper(
        # Categorical columns use -1 as the missing-value marker
        [([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer()
        ]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalImputer(missing_values=-1),
                       OneHotEncoder()])] +
        # Continuous columns use None (NaN) as the missing-value marker
        [(["acceleration"], [
            ContinuousDomain(missing_values=None),
            CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25],
                           labels=False),
            CategoricalImputer(),
            LabelBinarizer()
        ])] + [(["displacement"], [
            ContinuousDomain(missing_values=None),
            Imputer(),
            CutTransformer(bins=[0, 100, 200, 300, 400, 500],
                           labels=["XS", "S", "M", "L", "XL"]),
            LabelBinarizer()
        ])] + [(["horsepower"], [
            # Out-of-range values are clipped to [low_value, high_value] before imputation
            ContinuousDomain(missing_values=None,
                             outlier_treatment="as_extreme_values",
                             low_value=50,
                             high_value=225),
            Imputer()
        ])] + [(["weight"], [
            ContinuousDomain(missing_values=None,
                             outlier_treatment="as_extreme_values",
                             low_value=2000,
                             high_value=5000),
            Imputer()
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        tree = regressor.tree_
        # Record the impurity of every impure node as a PMML node extension
        node_impurity = {
            node_idx: tree.impurity[node_idx]
            for node_idx in range(0, tree.node_count)
            if tree.impurity[node_idx] != 0.0
        }
        pipeline.configure(
            node_extensions={regressor.criterion: node_impurity})
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def make_pipeline_model(numeric_feature, category_feature, estimator, X=None, y=None):
    '''
    Build a PMML pipeline from the given numeric and categorical feature
    lists and the given estimator; if a dataset is supplied, fit the
    pipeline before returning it.

    numeric_feature: list of numeric feature names
    category_feature: list of categorical feature names
    X: feature data, as a pandas.DataFrame
    y: target data, as a pandas.Series
    return: pipeline_model
    '''
    # Each categorical column: domain declaration -> imputation -> one-hot binarization
    feature_def = gen_features(
        columns=category_feature,
        classes=[CategoricalDomain, CategoricalImputer, LabelBinarizer])
    # All numeric columns share one mean-imputing, standardizing sub-mapper
    mapper_numerical = DataFrameMapper([(numeric_feature, [
        ContinuousDomain(),
        SimpleImputer(strategy='mean'),
        StandardScaler()
    ])])
    mapper_category = DataFrameMapper(feature_def)
    mapper = FeatureUnion([('mapper_numerical', mapper_numerical),
                           ('mapper_category', mapper_category)])
    pipeline_model = PMMLPipeline([('mapper', mapper),
                                   ('classifier', estimator)])
    # Fitting is optional; without data the unfitted pipeline is returned
    if X is not None and y is not None:
        pipeline_model.fit(X, y)
    return pipeline_model
def build_auto_h2o(regressor, name):
    """Fit an H2O regressor on the "Auto MPG" dataset, verify it, and store MOJO, pickle and CSV artifacts."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    transformer = ColumnTransformer(
        [(column, CategoricalDomain(), [column]) for column in cat_columns] +
        [(column, ContinuousDomain(), [column]) for column in cont_columns])
    # Upload the transformed matrix into an H2O frame with explicit column types.
    uploader = H2OFrameCreator(
        column_names=cat_columns + cont_columns,
        column_types=["enum", "enum", "enum", "numeric", "numeric", "numeric", "numeric"])
    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", uploader),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
def build_housing(regressor, name, with_kneighbors=False, **kwargs):
    """Fit a regressor on the "Housing" dataset and store the pickled pipeline and its predictions."""
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    # Expand pairwise interaction terms, standardize, then keep the top 35% of features.
    feature_pipeline = Pipeline([
        ("polynomial", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
        ("scaler", StandardScaler()),
        ("selector", SelectPercentile(score_func=f_regression, percentile=35)),
    ])
    pipeline = Pipeline([
        ("mapper", mapper),
        ("transformer-pipeline", feature_pipeline),
        ("regressor", regressor)
    ])
    pipeline.fit(housing_X, housing_y)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values, housing_y.name)
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors == True:
        Xt = pipeline_transform(pipeline, housing_X)
        # kneighbors() returns (distances, indices); keep the 0-based row
        # indices and shift them to 1-based identifiers.
        distances, indices = regressor.kneighbors(Xt)
        neighbor_columns = ["neighbor(" + str(position + 1) + ")" for position in range(regressor.n_neighbors)]
        medv_ids = DataFrame(indices + 1, columns=neighbor_columns)
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
def build_iris(classifier, name, with_proba=True, **kwargs):
    # Fits a classifier on the "Iris" dataset over normal- and log-scaled
    # features reduced by PCA, then stores the pickled pipeline and its
    # predictions.
    pipeline = Pipeline([
        ("pipeline", Pipeline([("domain", ContinuousDomain()),
                               # Present each feature on both its normal and its log10 scale
                               ("transform",
                                FeatureUnion([("normal_scale",
                                               FunctionTransformer(None)),
                                              ("log_scale",
                                               FunctionTransformer(numpy.log10))]))])),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
    # Post-fit estimator customization hook
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if (with_proba == True):
        species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=[
            "probability(setosa)", "probability(versicolor)",
            "probability(virginica)"
        ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_iris(classifier, name, **pmml_options):
    """Fit a classifier on the "Iris" dataset, verify it, and store PMML and CSV artifacts.

    pmml_options are forwarded to PMMLPipeline.configure().
    """
    cont_columns = [
        "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"
    ]
    cont_mappings = [([cont_column], ContinuousDomain())
                     for cont_column in cont_columns]
    mapper = DataFrameMapper(cont_mappings)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    # Configure PMML conversion options before embedding verification data,
    # matching the configure-then-verify order used by every other builder
    # in this file (the original verified first).
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predictions are float32-based; relax the verification tolerances
        pipeline.verify(iris_X.sample(n=3, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(iris_X.sample(n=3, random_state=13))
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=[
        "probability(setosa)", "probability(versicolor)",
        "probability(virginica)"
    ])
    store_csv(pandas.concat((species, species_proba), axis=1), name)
def build_auto(regressor, name, **kwargs):
    # Fits a regressor on the "Auto MPG" dataset with derived features,
    # verifies the fitted pipeline, and stores it with its predictions.
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight", "acceleration"],
         [ContinuousDomain(), StandardScaler()]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)], {
            "alias": "bin(model_year, 77)"
        }),  # Pre/post 1973 oil crisis effects
        (["origin"], OneHotEncoder()),
        # Derived ratio feature with a constant offset
        (["weight", "displacement"],
         ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {
             "alias": "weight / displacement + 0.5"
         })
    ])
    pipeline = Pipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    if isinstance(regressor, XGBRegressor):
        # XGBoost predictions are float32-based; relax the verification tolerances
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    # Post-fit estimator customization hook
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def build_auto(regressor, name, **pmml_options):
    # Fits a regressor on the "Auto MPG" dataset with lookup-based and derived
    # features, verifies the fitted pipeline, and stores it with predictions.
    # Maps (cylinders, origin) pairs to combined category labels
    cylinders_origin_mapping = {
        (8, 1) : "8/1",
        (6, 1) : "6/1",
        (4, 1) : "4/1",
        (6, 2) : "6/2",
        (4, 2) : "4/2",
        (6, 3) : "6/3",
        (4, 3) : "4/3"
    }
    mapper = DataFrameMapper([
        # Unlisted pairs fall back to the "other" label
        (["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
        (["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
        # Derived ratio feature with a constant offset
        (["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        # NOTE(review): SelectUnique presumably de-duplicates identical
        # feature columns before the final estimator — confirm semantics
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        # XGBoost predictions are float32-based; relax the verification tolerances
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
    store_csv(mpg, name)
def build_wheat(kmeans, name, with_affinity = True, **pmml_options):
    """Fit a clusterer on the "Wheat" dataset, then store the pipeline, cluster labels and (optionally) per-centroid affinities."""
    mapper = DataFrameMapper([
        (wheat_X.columns.values, [ContinuousDomain(dtype = float), IdentityTransformer()])
    ])
    # Columns 0 and 5 are robust-scaled; every remaining column is min-max scaled.
    scaler = ColumnTransformer([
        ("robust", RobustScaler(), [0, 5])
    ], remainder = MinMaxScaler())
    pipeline = Pipeline([
        ("mapper", mapper),
        ("scaler", scaler),
        ("clusterer", kmeans)
    ])
    pipeline.fit(wheat_X)
    pipeline = make_pmml_pipeline(pipeline, wheat_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    cluster = DataFrame(pipeline.predict(wheat_X), columns = ["Cluster"])
    if with_affinity == True:
        Xt = pipeline_transform(pipeline, wheat_X)
        # Distance of every row to each of the three cluster centroids.
        affinities = [kmeans_distance(kmeans, center, Xt) for center in range(3)]
        affinity_columns = ["affinity(" + str(center) + ")" for center in range(3)]
        cluster_affinity = DataFrame(numpy.transpose(affinities), columns = affinity_columns)
        cluster = pandas.concat((cluster, cluster_affinity), axis = 1)
    store_csv(cluster, name)
def build_iris(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
    """Fit a classifier on the "Iris" dataset over aggregated, multi-scaled, PCA-reduced features, then verify and store it.

    fit_params/predict_params/predict_proba_params default to None rather
    than to shared mutable {} literals (the classic mutable-default-argument
    pitfall); pass a dict to forward extra keyword arguments to the
    respective pipeline method.
    """
    if fit_params is None:
        fit_params = {}
    if predict_params is None:
        predict_params = {}
    if predict_proba_params is None:
        predict_proba_params = {}
    pipeline = Pipeline([
        ("pipeline", Pipeline([
            ("mapper", DataFrameMapper([
                (iris_X.columns.values, ContinuousDomain()),
                # Row-wise means of the length and width column pairs
                (["Sepal.Length", "Petal.Length"], Aggregator(function = "mean")),
                (["Sepal.Width", "Petal.Width"], Aggregator(function = "mean"))
            ])),
            # Present each feature on its normal, log10 and squared scale
            ("transform", FeatureUnion([
                ("normal_scale", FunctionTransformer(None, validate = True)),
                ("log_scale", FunctionTransformer(numpy.log10, validate = True)),
                ("power_scale", PowerFunctionTransformer(power = 2))
            ]))
        ])),
        ("pca", IncrementalPCA(n_components = 3, whiten = True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predictions are float32-based; relax the verification tolerances
        pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X, **predict_params), columns = ["Species"])
    if with_proba == True:
        species_proba = DataFrame(pipeline.predict_proba(iris_X, **predict_proba_params), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
        species = pandas.concat((species, species_proba), axis = 1)
    store_csv(species, name)
def build_housing(regressor, name, with_kneighbors = False, **pmml_options):
    # Fits a regressor on the "Housing" dataset, verifies the fitted pipeline,
    # and stores it together with its predictions.
    mapper = DataFrameMapper([
        (housing_X.columns.values, ContinuousDomain())
    ])
    pipeline = Pipeline([
        ("mapper", mapper),
        ("transformer-pipeline", Pipeline([
            # Expand pairwise interaction terms, standardize, keep top 35% of features
            ("polynomial", PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False)),
            ("scaler", StandardScaler()),
            # "passthrough" steps exercise no-op step handling
            ("passthrough-transformer", "passthrough"),
            ("selector", SelectPercentile(score_func = f_regression, percentile = 35)),
            ("passthrough-final-estimator", "passthrough")
        ])),
        ("regressor", regressor)
    ])
    pipeline.fit(housing_X, housing_y)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values, housing_y.name)
    pipeline.configure(**pmml_options)
    pipeline.verify(housing_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    medv = DataFrame(pipeline.predict(housing_X), columns = ["MEDV"])
    if with_kneighbors == True:
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        # kneighbors[1] holds 0-based neighbor row indices; shift to 1-based identifiers
        medv_ids = DataFrame(kneighbors[1] + 1, columns = ["neighbor(" + str(x + 1) + ")" for x in range(regressor.n_neighbors)])
        medv = pandas.concat((medv, medv_ids), axis = 1)
    store_csv(medv, name)
def build_audit(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
    """Fit a classifier on the "Audit" dataset with string-transformed categoricals, then verify and store it.

    fit_params/predict_params/predict_proba_params default to None rather
    than to shared mutable {} literals (the classic mutable-default-argument
    pitfall); pass a dict to forward extra keyword arguments to the
    respective pipeline method.
    """
    if fit_params is None:
        fit_params = {}
    if predict_params is None:
        predict_params = {}
    if predict_proba_params is None:
        predict_proba_params = {}
    continuous_mapper = DataFrameMapper([
        (["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])
    categorical_mapper = DataFrameMapper([
        # Keep the first three characters, one-hot encode, then select by tree importance
        (["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
        # Strip vowels before encoding
        (["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
        (["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
        (["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
        # Regex match yields a boolean, cast to 0/1
        (["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
        (["Deductions"], [CategoricalDomain()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predictions are float32-based; relax the verification tolerances
        pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
def build_versicolor(classifier, name, with_proba = True, **pmml_options):
    """Fit a binary classifier on the "Versicolor" dataset, verify it, and store the pipeline and its predictions."""
    # Decorate and robust-scale every continuous input column.
    continuous_pipeline = Pipeline([
        ("domain", ContinuousDomain()),
        ("scaler", RobustScaler())
    ])
    transformer = ColumnTransformer([
        ("continuous_columns", continuous_pipeline, versicolor_X.columns.values)
    ])
    # Expand cubic polynomial terms; the selector keeps every feature (k = "all").
    selector_pipeline = Pipeline([
        ("polynomial", PolynomialFeatures(degree = 3)),
        ("selector", SelectKBest(k = "all"))
    ])
    pipeline = Pipeline([
        ("transformer", transformer),
        ("transformer-selector-pipeline", selector_pipeline),
        ("classifier", classifier)
    ])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline = make_pmml_pipeline(pipeline, versicolor_X.columns.values, versicolor_y.name)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac = 0.10, random_state = 13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns = ["Species"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(versicolor_X), columns = ["probability(0)", "probability(1)"])
        species = pandas.concat((species, proba), axis = 1)
    store_csv(species, name)
def build_visit(regressor, name):
    """Fit a regressor on the "Visit" dataset, verify it, and store the pipeline and its predictions."""
    # One-hot encode the education level and every binary indicator column.
    features = [(["edlevel"], [CategoricalDomain(), OneHotEncoder()])]
    for bin_column in ["outwork", "female", "married", "kids", "self"]:
        features.append(([bin_column], [CategoricalDomain(), OneHotEncoder()]))
    # Continuous columns pass through domain decorators unchanged.
    features.append((["age"], ContinuousDomain()))
    features.append((["hhninc", "educ"], ContinuousDomain()))
    mapper = DataFrameMapper(features)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(visit_X, visit_y)
    pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
    store_csv(docvis, name)
def build_audit(classifier, name, with_proba=True, **kwargs):
    # Fits a classifier on the "Audit" dataset using separate continuous and
    # categorical mappers joined by a FeatureUnion, verifies the fitted
    # pipeline, and stores it together with its predictions.
    continuous_mapper = DataFrameMapper([
        (["Age", "Income", "Hours"],
         MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])
    categorical_mapper = DataFrameMapper([
        # Binarize, then keep only features selected by an embedded tree model
        (["Employment"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        (["Education"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13,
                                                   n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        (["Marital"], [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        (["Occupation"],
         [CategoricalDomain(),
          LabelBinarizer(),
          SelectKBest(k=3)]),
        (["Gender"],
         [CategoricalDomain(),
          LabelBinarizer(neg_label=-3, pos_label=3)]),
        (["Deductions"], [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([("continuous", continuous_mapper),
                                ("categorical",
                                 Pipeline([("mapper", categorical_mapper),
                                           ("polynomial",
                                            PolynomialFeatures())]))])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predictions are float32-based; relax the verification tolerances
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    # Post-fit estimator customization hook
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def test_fit_float(self):
    # Verifies missing-value replacement ("as_value" treatment) and the
    # statistics collected by ContinuousDomain during fitting.
    domain = ContinuousDomain(missing_value_treatment = "as_value", missing_value_replacement = -1.0, invalid_value_treatment = "as_is")
    self.assertEqual("as_value", domain.missing_value_treatment)
    self.assertEqual(-1.0, domain.missing_value_replacement)
    self.assertEqual("as_is", domain.invalid_value_treatment)
    # Fitted attributes must not exist before fitting
    self.assertFalse(hasattr(domain, "data_min_"))
    self.assertFalse(hasattr(domain, "data_max_"))
    self.assertFalse(hasattr(domain, "counts_"))
    self.assertFalse(hasattr(domain, "numeric_info_"))
    X = DataFrame(numpy.array([1.0, float('NaN'), 3.0, 2.0, float('NaN'), 2.0]))
    Xt = domain.fit_transform(X)
    # Statistics are computed over the four non-NaN values only
    self.assertEqual(1.0, domain.data_min_)
    self.assertEqual(3.0, domain.data_max_)
    self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
    self.assertEqual({"minimum" : [1.0], "maximum" : [3.0], "mean" : [2.0], "standardDeviation" : [0.7071067811865476], "median" : [2.0], "interQuartileRange" : [0.5]}, _array_to_list(domain.numeric_info_))
    # NaN entries are replaced by -1.0 in the transformed output
    self.assertEqual(numpy.array([1.0, -1.0, 3.0, 2.0, -1.0, 2.0]).tolist(), Xt[0].tolist())
    X = numpy.array([float('NaN'), None])
    Xt = domain.transform(X)
    self.assertEqual(numpy.array([-1.0, -1.0]).tolist(), Xt.tolist())
def build_audit_na(classifier, name, with_proba=True, fit_params=None, predict_params=None, predict_proba_params=None, predict_transformer=None, predict_proba_transformer=None, apply_transformer=None, **pmml_options):
    """Fit a classifier on the Audit-NA dataset (with missing values), verify, and persist it.

    Each column is decorated with a missing-value-aware domain plus an imputation
    strategy (flag-and-impute for Age/Hours, median for Income, mode/constant for
    the categorical columns). Remaining keyword options are forwarded to
    PMMLPipeline.configure().
    """
    # Avoid mutable default arguments: fresh dicts per call.
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params
    predict_proba_params = {} if predict_proba_params is None else predict_proba_params
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {
        "FEMALE": 0.0,
        "MALE": 1.0,
        "MISSING_VALUE": 0.5
    }
    mapper = DataFrameMapper(
        # Age: replace missing with sentinel -999, then impute the constant 38
        [(["Age"], [ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype=int), name="flag_missing(Age, -999)"), SimpleImputer(missing_values=-999, strategy="constant", fill_value=38)])] +
        [(["Age"], MissingIndicator())] +
        # Hours: same sentinel trick, mean imputation with an indicator column
        [(["Hours"], [ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name="flag_missing(Hours, -999)"), SimpleImputer(missing_values=-999, add_indicator=True)])] +
        # Income: out-of-range values become missing, then median imputation
        [(["Income"], [ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values", low_value=5000, high_value=200000, with_data=False), SimpleImputer(strategy="median", add_indicator=True)])] +
        # Employment: normalize case, collapse via lookup table, then binarize
        [(["Employment"], [CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(missing_values=None), StringNormalizer(function="uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function="lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values=None, missing_value_replacement="N/A", with_data=False), SimpleImputer(missing_values="N/A", strategy="most_frequent"), StringNormalizer(function="lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        # Gender: map to numeric codes; unmapped values become None
        [(["Gender"], [CategoricalDomain(missing_values=None, with_data=False), SimpleImputer(strategy="constant"), StringNormalizer(function="uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_transformer=predict_transformer, predict_proba_transformer=predict_proba_transformer, apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y, **fit_params)
    pipeline.configure(**pmml_options)
    verification_sample = audit_na_X.sample(frac=0.05, random_state=13)
    # XGBoost predictions reproduce only up to a small numeric tolerance.
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(verification_sample, predict_params=predict_params, predict_proba_params=predict_proba_params, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(verification_sample, predict_params=predict_params, predict_proba_params=predict_proba_params)
    # Fix: pass explicit file extensions, consistent with the sibling builders
    # (build_audit, build_audit_cat) which call store_pkl/store_csv with
    # name + ".pkl" / name + ".csv".
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    # Decision trees additionally expose the terminal node id per row
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
def make_fit_lgbmlr(gbdt, lr):
    """Build and fit a GBDT+LR pipeline over the module-level `df` dataset.

    Categorical columns are label-encoded; continuous columns share one domain.
    Returns the fitted PMMLPipeline.
    """
    cat_features = [([col], [CategoricalDomain(), LabelEncoder()]) for col in cat_columns]
    cont_features = [(cont_columns, ContinuousDomain())]
    mapper = DataFrameMapper(cat_features + cont_features)
    classifier = GBDTLRClassifier(gbdt, lr)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    # The first len(cat_columns) mapped columns are the label-encoded categoricals.
    pipeline.fit(
        df[cat_columns + cont_columns],
        df[label_column],
        classifier__gbdt__categorical_feature=range(len(cat_columns))
    )
    return pipeline
def feature_union(category_feature, numeric_feature):
    """Assemble a FeatureUnion of categorical and numerical preprocessing mappers.

    Categorical columns get domain decoration, imputation and label encoding;
    numerical columns get domain decoration, mean imputation and scaling.
    """
    categorical_mapper = DataFrameMapper(gen_features(
        columns=category_feature,
        # LabelEncoder
        classes=[CategoricalDomain, CategoricalImputer, LabelEncoder]
    ))
    numerical_mapper = DataFrameMapper([
        (numeric_feature, [ContinuousDomain(), SimpleImputer(strategy='mean'), StandardScaler()])
    ])
    return FeatureUnion([
        ('mapper_category', categorical_mapper),
        ('mapper_numerical', numerical_mapper)
    ])
def test_fit_transform(self):
    """MultiDomain applies per-column missing-value replacement on DataFrame and ndarray inputs."""
    domain = MultiDomain([ContinuousDomain(missing_value_replacement=0.0), CategoricalDomain(missing_value_replacement="zero")])
    X = DataFrame([[-1.0, "minus one"], [float("NaN"), None], [1.0, "one"]], columns=["x1", "x2"])
    Xt = domain.fit_transform(X)
    # DataFrame in, DataFrame out; missing cells replaced per-column
    self.assertTrue(isinstance(Xt, DataFrame))
    self.assertEqual([-1.0, 0.0, 1.0], Xt["x1"].tolist())
    self.assertEqual(["minus one", "zero", "one"], Xt["x2"].tolist())
    X = numpy.array([[float("NaN"), None]])
    Xt = domain.transform(X)
    self.assertTrue(isinstance(Xt, numpy.ndarray))
    # Bug fix: these were assertTrue(list, ...), which passes for any non-empty
    # list (the second argument is just the failure message). assertEqual
    # actually verifies the replacement values.
    self.assertEqual([0.0], Xt[:, 0].tolist())
    self.assertEqual(["zero"], Xt[:, 1].tolist())
def test_mapper(self):
    """A domain fitted inside a DataFrameMapper records per-column statistics."""
    domain = ContinuousDomain()
    rows = [
        {"X1": 2.0, "X2": 2, "y": 2.0},
        {"X1": 1.0, "X2": 0.5},
        {"X1": 2},
        {"X2": 2},
        {"X1": 2.0, "y": 1},
        {"X1": 3.0, "X2": 3.5}
    ]
    df = DataFrame(rows)
    mapper = DataFrameMapper([
        (["X1", "X2"], [domain, SimpleImputer(), StandardScaler()]),
        ("y", None)
    ])
    mapper.fit_transform(df)
    # X1 has 1 missing cell, X2 has 2; no invalid values in either column
    self.assertEqual({"totalFreq": [6, 6], "missingFreq": [1, 2], "invalidFreq": [0, 0]}, _array_to_list(domain.counts_))
    stats = {key: domain.numeric_info_[key] for key in ["minimum", "maximum", "mean"]}
    self.assertEqual({"minimum": [1.0, 0.5], "maximum": [3.0, 3.5], "mean": [2.0, 2.0]}, _array_to_list(stats))
    self.assertEqual([1.0, 0.5], domain.data_min_.tolist())
    self.assertEqual([3.0, 3.5], domain.data_max_.tolist())