def build_housing(regressor, name, with_kneighbors = False, **pmml_options):
	"""Fit, convert, verify and store a PMML regression pipeline for the housing dataset.

	Parameters:
	regressor: A Scikit-Learn regressor (the final pipeline step).
	name: Base name of the stored pickle and CSV artifacts.
	with_kneighbors: If True, also export 1-based nearest-neighbor row identifiers
		(requires a k-neighbors regressor exposing `kneighbors` and `n_neighbors`).
	pmml_options: Extra conversion options, forwarded to `PMMLPipeline.configure`.
	"""
	mapper = DataFrameMapper([
		(housing_X.columns.values, ContinuousDomain())
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("transformer-pipeline", Pipeline([
			("polynomial", PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False)),
			("scaler", StandardScaler()),
			("passthrough-transformer", "passthrough"),
			("selector", SelectPercentile(score_func = f_regression, percentile = 35)),
			("passthrough-final-estimator", "passthrough")
		])),
		("regressor", regressor)
	])
	pipeline.fit(housing_X, housing_y)
	pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values, housing_y.name)
	pipeline.configure(**pmml_options)
	pipeline.verify(housing_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	medv = DataFrame(pipeline.predict(housing_X), columns = ["MEDV"])
	# Idiomatic truth test instead of "== True"
	if with_kneighbors:
		Xt = pipeline_transform(pipeline, housing_X)
		kneighbors = regressor.kneighbors(Xt)
		# kneighbors[1] holds 0-based row indices; shift to 1-based identifiers
		medv_ids = DataFrame(kneighbors[1] + 1, columns = ["neighbor(" + str(x + 1) + ")" for x in range(regressor.n_neighbors)])
		medv = pandas.concat((medv, medv_ids), axis = 1)
	store_csv(medv, name)
def build_wheat(kmeans, name, with_affinity = True, **pmml_options):
	"""Fit, convert and store a PMML clustering pipeline for the wheat dataset.

	Parameters:
	kmeans: A Scikit-Learn k-means clusterer (assumed to have three clusters).
	name: Base name of the stored pickle and CSV artifacts.
	with_affinity: If True, also export the distance to each cluster center.
	pmml_options: Extra conversion options, forwarded to `PMMLPipeline.configure`.
	"""
	mapper = DataFrameMapper([
		(wheat_X.columns.values, [ContinuousDomain(dtype = float), IdentityTransformer()])
	])
	scaler = ColumnTransformer([
		("robust", RobustScaler(), [0, 5])
	], remainder = MinMaxScaler())
	pipeline = Pipeline([
		("mapper", mapper),
		("scaler", scaler),
		("clusterer", kmeans)
	])
	pipeline.fit(wheat_X)
	pipeline = make_pmml_pipeline(pipeline, wheat_X.columns.values)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	cluster = DataFrame(pipeline.predict(wheat_X), columns = ["Cluster"])
	# Idiomatic truth test instead of "== True"
	if with_affinity:
		Xt = pipeline_transform(pipeline, wheat_X)
		# One distance column per cluster center; replaces three copy-pasted locals
		affinities = [kmeans_distance(kmeans, center, Xt) for center in range(3)]
		cluster_affinity = DataFrame(numpy.transpose(affinities), columns = ["affinity(" + str(center) + ")" for center in range(3)])
		cluster = pandas.concat((cluster, cluster_affinity), axis = 1)
	store_csv(cluster, name)
def build_iris(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
	"""Fit, convert, verify and store a PMML classification pipeline for the iris dataset.

	Parameters:
	classifier: A Scikit-Learn classifier (the final pipeline step).
	name: Base name of the stored pickle and CSV artifacts.
	with_proba: If True, also export class probability columns.
	fit_params, predict_params, predict_proba_params: Optional keyword argument
		dicts forwarded to the corresponding pipeline methods.
	pmml_options: Extra conversion options, forwarded to `PMMLPipeline.configure`.
	"""
	# Avoid mutable default arguments; normalize None to empty dicts
	fit_params = dict() if fit_params is None else fit_params
	predict_params = dict() if predict_params is None else predict_params
	predict_proba_params = dict() if predict_proba_params is None else predict_proba_params
	pipeline = Pipeline([
		("pipeline", Pipeline([
			("mapper", DataFrameMapper([
				(iris_X.columns.values, ContinuousDomain()),
				(["Sepal.Length", "Petal.Length"], Aggregator(function = "mean")),
				(["Sepal.Width", "Petal.Width"], Aggregator(function = "mean"))
			])),
			("transform", FeatureUnion([
				("normal_scale", FunctionTransformer(None, validate = True)),
				("log_scale", FunctionTransformer(numpy.log10, validate = True)),
				("power_scale", PowerFunctionTransformer(power = 2))
			]))
		])),
		("pca", IncrementalPCA(n_components = 3, whiten = True)),
		("classifier", classifier)
	])
	pipeline.fit(iris_X, iris_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
	pipeline.configure(**pmml_options)
	# XGBoost predictions are less reproducible across platforms; relax verification tolerances
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X, **predict_params), columns = ["Species"])
	if with_proba:
		species_proba = DataFrame(pipeline.predict_proba(iris_X, **predict_proba_params), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
		species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)
def build_versicolor(classifier, name, with_proba = True, **pmml_options):
	"""Fit, convert, verify and store a PMML binary classification pipeline for the versicolor dataset.

	Parameters:
	classifier: A Scikit-Learn classifier (the final pipeline step).
	name: Base name of the stored pickle and CSV artifacts.
	with_proba: If True, also export class probability columns.
	pmml_options: Extra conversion options, forwarded to `PMMLPipeline.configure`.
	"""
	transformer = ColumnTransformer([
		("continuous_columns", Pipeline([
			("domain", ContinuousDomain()),
			("scaler", RobustScaler())
		]), versicolor_X.columns.values)
	])
	pipeline = Pipeline([
		("transformer", transformer),
		("transformer-selector-pipeline", Pipeline([
			("polynomial", PolynomialFeatures(degree = 3)),
			("selector", SelectKBest(k = "all"))
		])),
		("classifier", classifier)
	])
	pipeline.fit(versicolor_X, versicolor_y)
	pipeline = make_pmml_pipeline(pipeline, versicolor_X.columns.values, versicolor_y.name)
	pipeline.configure(**pmml_options)
	pipeline.verify(versicolor_X.sample(frac = 0.10, random_state = 13))
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(versicolor_X), columns = ["Species"])
	# Idiomatic truth test instead of "== True"
	if with_proba:
		species_proba = DataFrame(pipeline.predict_proba(versicolor_X), columns = ["probability(0)", "probability(1)"])
		species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)
def build_audit(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
	"""Fit, convert, verify and store a PMML classification pipeline for the audit dataset.

	Exercises a wide mix of categorical encodings (one-hot, label-binarization,
	string transforms) combined with continuous features via a FeatureUnion.

	Parameters:
	classifier: A Scikit-Learn classifier (the final pipeline step).
	name: Base name of the stored pickle and CSV artifacts.
	with_proba: If True, also export class probability columns.
	fit_params, predict_params, predict_proba_params: Optional keyword argument
		dicts forwarded to the corresponding pipeline methods.
	pmml_options: Extra conversion options, forwarded to `PMMLPipeline.configure`.
	"""
	# Avoid mutable default arguments; normalize None to empty dicts
	fit_params = dict() if fit_params is None else fit_params
	predict_params = dict() if predict_params is None else predict_params
	predict_proba_params = dict() if predict_proba_params is None else predict_proba_params
	continuous_mapper = DataFrameMapper([
		(["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
	])
	categorical_mapper = DataFrameMapper([
		(["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
		(["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
		(["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		(["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
		(["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
		(["Deductions"], [CategoricalDomain()]),
	])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.configure(**pmml_options)
	# XGBoost predictions are less reproducible across platforms; relax verification tolerances
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
def build_auto(regressor, name, **pmml_options):
	"""Fit, convert, verify and store a PMML regression pipeline for the auto-mpg dataset.

	Parameters:
	regressor: A Scikit-Learn regressor (the final pipeline step).
	name: Base name of the stored pickle and CSV artifacts.
	pmml_options: Extra conversion options, forwarded to `PMMLPipeline.configure`.
	"""
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
	pipeline.configure(**pmml_options)
	# XGBoost predictions are less reproducible across platforms; relax verification tolerances
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	# Consistency fix: every sibling builder passes the bare name to store_pkl/store_csv;
	# this function alone appended ".pkl"/".csv", which would double the file extension
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
def build_iforest_housing(iforest, name, **pmml_options):
	"""Fit, convert and store a PMML outlier-detection pipeline for the housing dataset.

	Parameters:
	iforest: A Scikit-Learn isolation forest estimator (the final pipeline step).
	name: Base name of the stored pickle and CSV artifacts.
	pmml_options: Extra conversion options, forwarded to `PMMLPipeline.configure`.
	"""
	mapper = DataFrameMapper([
		(housing_X.columns.values, ContinuousDomain())
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("estimator", iforest)
	])
	pipeline.fit(housing_X)
	pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	decision_function = DataFrame(pipeline.decision_function(housing_X), columns = ["decisionFunction"])
	# Rows predicted as -1 are outliers; encode the boolean flag as "true"/"false" strings
	is_outlier = pipeline.predict(housing_X) == -1
	outlier = DataFrame(is_outlier, columns = ["outlier"]).replace(True, "true").replace(False, "false")
	result = pandas.concat([decision_function, outlier], axis = 1)
	store_csv(result, name)