def build_auto(regressor, name, **pmml_options):
    """Fit, verify and export an "auto" dataset regressor as PMML plus a CSV of predictions."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    # LightGBM consumes label-encoded categoricals; every other estimator gets binarized ones.
    if isinstance(regressor, LGBMRegressor):
        cat_mappings = [([col], [cat_domain(name), label_encoder(name)]) for col in cat_columns]
    else:
        cat_mappings = [([col], [cat_domain(name), label_binarizer(name)]) for col in cat_columns]
    cont_mappings = [([col], [cont_domain(name)]) for col in cont_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
        ("regressor", regressor)
    ])
    if isinstance(regressor, LGBMRegressor):
        # The three categorical columns come first in the mapped matrix.
        pipeline.fit(auto_X, auto_y, regressor__categorical_feature=[0, 1, 2])
    elif isinstance(regressor, IsolationForest):
        # Unsupervised outlier detector: no target vector.
        pipeline.fit(auto_X)
    else:
        pipeline.fit(auto_X, auto_y)
    sample = auto_X.sample(n=3, random_state=13)
    if isinstance(regressor, XGBRegressor):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    if isinstance(regressor, IsolationForest):
        decision_function = DataFrame(pipeline.decision_function(auto_X), columns=["decisionFunction"])
        outlier = DataFrame(pipeline.predict(auto_X), columns=["outlier"])
        # Map the -1/+1 prediction onto "true"/"false" strings.
        outlier["outlier"] = outlier["outlier"].apply(lambda x: str(bool(x == -1)).lower())
        store_csv(pandas.concat((decision_function, outlier), axis=1), name)
    else:
        store_csv(DataFrame(pipeline.predict(auto_X), columns=["mpg"]), name)
def submodel_evaluation(train_data, valid_data, model_list,
        category_feature, numeric_feature):
    """Fit each candidate model on the training split and score it on the validation split.

    Returns a dict mapping the model's class name to [KS statistic, ROC AUC, accuracy].
    Fix: plot_ks_curve now receives the precomputed y_valid instead of re-indexing
    valid_data['user_type'] — same values, consistent with the AUC/accuracy calls.
    """
    feature_columns = category_feature + numeric_feature
    X_train = train_data[feature_columns]
    y_train = train_data['user_type']
    X_valid = valid_data[feature_columns]
    y_valid = valid_data['user_type']
    pipeline_transformer = feature_union(category_feature, numeric_feature)
    model_result_dict = {}
    for model in model_list:
        model_name = model.__class__.__name__
        print('model %s evaluation' % model_name)
        sub_model = PMMLPipeline([
            ('mapper', pipeline_transformer),
            ('classifier', model)
        ])
        sub_model.fit(X_train, y_train)
        # Positive-class probability for ranking metrics; hard label for accuracy.
        predict_valid = sub_model.predict_proba(X_valid)[:, 1]
        predict_label = sub_model.predict(X_valid)
        model_ks = plot_ks_curve(predict_valid, y_valid)
        model_auc = roc_auc_score(y_valid, predict_valid)
        accuracy = metrics.accuracy_score(y_valid, predict_label)
        model_result_dict[model_name] = [model_ks, model_auc, accuracy]
    return model_result_dict
def build_auto_na(regressor, name, predict_transformer=None, apply_transformer=None, **pmml_options):
    """Fit and export an "auto" regressor trained on data with missing values."""
    # Categorical columns use -1 as the missing-value marker.
    cat_mappings = [([col], [CategoricalDomain(missing_values=-1), CategoricalImputer(missing_values=-1), PMMLLabelBinarizer()]) for col in ["cylinders", "model_year"]]
    origin_mapping = [(["origin"], [CategoricalDomain(missing_values=-1), SimpleImputer(missing_values=-1, strategy="most_frequent"), OneHotEncoder()])]
    # Continuous columns use NaN as the missing-value marker; some are binned into categories.
    accel_mapping = [(["acceleration"], [ContinuousDomain(missing_values=None), CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels=False), CategoricalImputer(), LabelBinarizer()])]
    displ_mapping = [(["displacement"], [ContinuousDomain(missing_values=None), SimpleImputer(), CutTransformer(bins=[0, 100, 200, 300, 400, 500], labels=["XS", "S", "M", "L", "XL"]), LabelBinarizer()])]
    hp_mapping = [(["horsepower"], [ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=50, high_value=225), SimpleImputer(strategy="median")])]
    weight_mapping = [(["weight"], [ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=2000, high_value=5000), SimpleImputer(strategy="median")])]
    mapper = DataFrameMapper(cat_mappings + origin_mapping + accel_mapping + displ_mapping + hp_mapping + weight_mapping)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ], predict_transformer=predict_transformer, apply_transformer=apply_transformer)
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        # Attach per-node impurities of the fitted tree as PMML node extensions.
        tree = regressor.tree_
        node_impurity = {idx: tree.impurity[idx] for idx in range(0, tree.node_count) if tree.impurity[idx] != 0.0}
        pmml_options["node_extensions"] = {regressor.criterion: node_impurity}
    pipeline.configure(**pmml_options)
    pipeline.verify(auto_na_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    if isinstance(regressor, DecisionTreeRegressor):
        Xt = pipeline_transform(pipeline, auto_na_X)
        mpg_apply = DataFrame(regressor.apply(Xt), columns=["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis=1)
    store_csv(mpg, name)
def build_auto(regressor, name, fit_params=None, predict_params=None, **pmml_options):
    """Fit, verify and export an "auto" dataset regressor with engineered features.

    Fix: fit_params/predict_params previously defaulted to shared mutable dicts
    (the mutable-default-argument pitfall); they now default to None.
    """
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params
    # (cylinders, origin) pairs kept verbatim; every other pair maps to "other".
    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (4, 3): "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype=numpy.int8), name="odd(cylinders)", prefit=True)]),
        (["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value="other"), OneHotEncoder()]),
        # model_year -> "19XX-01-01" date -> days since 1977 -> binary indicator.
        (["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold=0)], {"alias": "bin(model_year, 1977)"}),
        (["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse=False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators=3, random_state=13), threshold="1.25 * mean"))]),
        (["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype=numpy.float64)], {"alias": "weight / displacement + 0.5"}),
        (["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)
    sample = auto_X.sample(frac=0.05, random_state=13)
    if isinstance(regressor, XGBRegressor):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, predict_params=predict_params, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample, predict_params=predict_params)
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns=["mpg"])
    store_csv(mpg, name)
def build_iris(classifier, name, **pmml_options):
    """Fit, verify and export an Iris species classifier plus its predictions."""
    columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
    mapper = DataFrameMapper([([column], ContinuousDomain()) for column in columns])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    sample = iris_X.sample(n=3, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    proba_columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=proba_columns)
    store_csv(pandas.concat((species, species_proba), axis=1), name)
def build_auto(regressor, name, **pmml_options):
    """Fit, verify and export an "auto" dataset regressor with lookup/binning features."""
    # (cylinders, origin) pairs kept verbatim; other pairs collapse to "other".
    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (6, 3): "6/3",
        (4, 3): "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value="other"), LabelBinarizer()]),
        # Binarize at model_year 77 (NOTE(review): original comment referred to the
        # 1973 oil crisis, but the threshold used here is 77 — confirm intent).
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)], {"alias": "bin(model_year, 77)"}),
        (["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
        (["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias": "weight / displacement + 0.5"})
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    sample = auto_X.sample(frac=0.05, random_state=13)
    if isinstance(regressor, XGBRegressor):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample)
    store_pkl(pipeline, name)
    store_csv(DataFrame(pipeline.predict(auto_X), columns=["mpg"]), name)
def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    """Fit and export a TF-IDF based sentiment classifier (pickle + CSV)."""
    # Random forests are trained on float32 term weights, everything else on float64.
    tfidf_dtype = numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64
    vectorizer = TfidfVectorizer(
        analyzer="word",
        preprocessor=None,
        strip_accents=None,
        lowercase=True,
        token_pattern=None,
        tokenizer=Splitter(),
        stop_words="english",
        ngram_range=(1, 2),
        norm=None,
        dtype=tfidf_dtype)
    pipeline = PMMLPipeline([
        ("tf-idf", vectorizer),
        ("selector", SelectKBest(f_classif, k=500)),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba == True:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
def build_auto_h2o(regressor, name):
    """Fit an H2O regressor on the "auto" dataset; export MOJO, pickle and predictions."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    transformer = ColumnTransformer(
        [(col, CategoricalDomain(), [col]) for col in cat_columns] +
        [(col, ContinuousDomain(), [col]) for col in cont_columns])
    # H2O needs explicit column names/types when re-assembling the frame.
    uploader = H2OFrameCreator(
        column_names=cat_columns + cont_columns,
        column_types=["enum", "enum", "enum", "numeric", "numeric", "numeric", "numeric"])
    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", uploader),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    # Store the MOJO of the fitted estimator alongside the pickled pipeline.
    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
def build_audit(classifier, name, **pmml_options):
    """Fit, verify and export an "audit" dataset classifier as PMML plus predictions."""
    # LightGBM treats Age as categorical; other estimators keep it continuous.
    if isinstance(classifier, LGBMClassifier):
        cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
        cont_columns = ["Income", "Hours"]
    else:
        cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
        cont_columns = ["Age", "Income", "Hours"]
    # LightGBM consumes label-encoded categoricals; everything else gets binarized ones.
    if isinstance(classifier, LGBMClassifier):
        cat_mappings = [([col], [cat_domain(name), label_encoder(name)]) for col in cat_columns]
    else:
        cat_mappings = [([col], [cat_domain(name), label_binarizer(name)]) for col in cat_columns]
    cont_mappings = [([col], cont_domain(name)) for col in cont_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
        ("classifier", classifier)
    ])
    if isinstance(classifier, LGBMClassifier):
        pipeline.fit(audit_X, audit_y, classifier__categorical_feature=[0, 1, 2, 3, 4, 5])
    elif isinstance(classifier, XGBClassifier):
        if name == "XGBoostAuditNA":
            # NOTE(review): mutates the shared audit_X frame in place — later callers see floats.
            audit_X["Age"] = audit_X["Age"].astype(float)
        pipeline.fit(audit_X, audit_y)
    else:
        pipeline.fit(audit_X, audit_y)
    sample = audit_X.sample(n=3, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((adjusted, adjusted_proba), axis=1), name)
def build_iris_vec(classifier, name):
    """Fit a bare (mapper-less) Iris classifier and export PMML plus predictions."""
    pipeline = PMMLPipeline([("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    proba_columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=proba_columns)
    store_csv(pandas.concat((species, species_proba), axis=1), name)
def build_auto_isotonic(regressor, auto_isotonic_X, name):
    """Fit, verify and export an isotonic "auto" regressor over the given feature frame."""
    pipeline = PMMLPipeline([("regressor", regressor)])
    pipeline.fit(auto_isotonic_X, auto_y)
    pipeline.verify(auto_isotonic_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    store_csv(DataFrame(pipeline.predict(auto_isotonic_X), columns=["mpg"]), name)
def build_audit(mapper, classifier, name, **pmml_options):
    """Fit an "audit" classifier behind a caller-supplied mapper and export it."""
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    labels = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    probabilities = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((labels, probabilities), axis=1), name)
def build_apollo(mapper, name):
    """Fit a decision tree on the "apollo" frame behind the given mapper and export it."""
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", DecisionTreeClassifier())
    ])
    pipeline.fit(df, df["success"])
    store_pkl(pipeline, name)
    labels = DataFrame(pipeline.predict(df), columns=["success"])
    probabilities = DataFrame(pipeline.predict_proba(df), columns=["probability(false)", "probability(true)"])
    store_csv(pandas.concat((labels, probabilities), axis=1), name)
def build_audit_dict(classifier, name, with_proba=True):
    """Fit an "audit" classifier over dict-shaped rows (DictVectorizer) and export it."""
    pipeline = PMMLPipeline([
        ("dict-transformer", DictVectorizer()),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_dict_X), columns=["Adjusted"])
    if with_proba == True:
        probabilities = DataFrame(pipeline.predict_proba(audit_dict_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, probabilities), axis=1)
    store_csv(adjusted, name)
def build_auto_opt(regressor, name, fit_params=None, **pmml_options):
    """Fit a regressor on the "auto" training subset, verify on a sample, and export it.

    Fix: fit_params previously defaulted to a shared mutable dict (the
    mutable-default-argument pitfall); it now defaults to None.
    NOTE(review): pmml_options is accepted but unused, as in the original — confirm intent.
    """
    fit_params = {} if fit_params is None else fit_params
    pipeline = PMMLPipeline([
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params)
    sample = auto_X.sample(frac=0.05, random_state=13)
    if isinstance(regressor, XGBRegressor):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample)
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name)
def test_predict_transform(self):
    """predict_transform() should append post-processed columns to the raw prediction."""
    post_processor = FeatureUnion([
        ("identity", FunctionTransformer(None)),
        ("log10", FunctionTransformer(numpy.log10))
    ])
    pipeline = PMMLPipeline([("estimator", DummyRegressor())], predict_transformer=post_processor)
    X = DataFrame([[1, 0], [2, 0], [3, 0]], columns=["X1", "X2"])
    y = Series([0.5, 1.0, 1.5], name="y")
    pipeline.fit(X, y)
    # DummyRegressor always predicts the training mean (1.0).
    expected_pred = [1.0, 1.0, 1.0]
    expected_row = [1.0, 1.0, numpy.log10(1.0)]
    self.assertEqual(expected_pred, pipeline.predict(X).tolist())
    self.assertEqual([expected_row for _ in range(0, 3)], pipeline.predict_transform(X).tolist())
def build_auto_na_hist(regressor, name):
    """Fit a histogram-capable regressor on "auto" data with missing values and export it."""
    cont_mappings = [([col], ContinuousDomain()) for col in ["displacement", "horsepower", "weight", "acceleration"]]
    cat_mappings = [([col], [CategoricalDomain(), PMMLLabelBinarizer()]) for col in ["cylinders", "model_year", "origin"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cont_mappings + cat_mappings)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    pipeline.verify(auto_na_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    store_csv(DataFrame(pipeline.predict(auto_na_X), columns=["mpg"]), name)
def build_iris_opt(classifier, name, fit_params=None, **pmml_options):
    """Fit a classifier on the Iris training subset, verify on a sample, and export it.

    Fix: fit_params previously defaulted to a shared mutable dict (the
    mutable-default-argument pitfall); it now defaults to None.
    NOTE(review): pmml_options is accepted but unused, as in the original — confirm intent.
    """
    fit_params = {} if fit_params is None else fit_params
    pipeline = PMMLPipeline([
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params)
    sample = iris_X.sample(frac=0.10, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample)
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
    species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name)
def build_sentiment(classifier, transformer, name, with_proba=True, **pmml_options):
    """Fit a sentiment classifier over a caller-supplied text transformer and export PMML."""
    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("densifier", DenseTransformer()),
        ("selector", SelectKBest(f_classif, k=500)),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:
        probabilities = DataFrame(pipeline.predict_proba(sentiment_X), columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, probabilities), axis=1)
    store_csv(score, name)
def build_auto_na(regressor, name):
    """Fit and export an "auto" regressor trained on data with missing values.

    Fix: sklearn.preprocessing.Imputer was deprecated in 0.20 and removed in 0.22;
    replaced with SimpleImputer, which has the same mean-imputation default, so
    behavior is unchanged.
    """
    mapper = DataFrameMapper(
        # Categorical columns use -1 as the missing-value marker.
        [([column], [CategoricalDomain(missing_values=-1), CategoricalImputer(missing_values=-1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalImputer(missing_values=-1), OneHotEncoder()])] +
        # Continuous columns use NaN; some are binned into categorical features.
        [(["acceleration"], [ContinuousDomain(missing_values=None), CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels=False), CategoricalImputer(), LabelBinarizer()])] +
        [(["displacement"], [ContinuousDomain(missing_values=None), SimpleImputer(), CutTransformer(bins=[0, 100, 200, 300, 400, 500], labels=["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
        [(["horsepower"], [ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=50, high_value=225), SimpleImputer()])] +
        [(["weight"], [ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=2000, high_value=5000), SimpleImputer()])])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        # Attach per-node impurities of the fitted tree as PMML node extensions.
        tree = regressor.tree_
        node_impurity = {
            node_idx: tree.impurity[node_idx]
            for node_idx in range(0, tree.node_count)
            if tree.impurity[node_idx] != 0.0
        }
        pipeline.configure(node_extensions={regressor.criterion: node_impurity})
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def build_visit(regressor, name):
    """Fit, verify and export a doctor-visit count regressor."""
    bin_columns = ["outwork", "female", "married", "kids", "self"]
    mapper = DataFrameMapper(
        [(["edlevel"], [CategoricalDomain(), OneHotEncoder()])] +
        [([col], [CategoricalDomain(), OneHotEncoder()]) for col in bin_columns] +
        [(["age"], ContinuousDomain())] +
        [(["hhninc", "educ"], ContinuousDomain())])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(visit_X, visit_y)
    pipeline.verify(visit_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    store_csv(DataFrame(pipeline.predict(visit_X), columns=["docvis"]), name)
def build_audit_na_direct(classifier, name):
    """Fit an "audit" classifier that passes numeric columns through untransformed."""
    mapper = DataFrameMapper([
        # Numeric columns go straight through; string columns are one-hot encoded.
        (["Age", "Hours", "Income"], None),
        (["Employment", "Education", "Marital", "Occupation", "Gender"], OneHotEncoder())
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.verify(audit_na_X.sample(frac=0.05, random_state=13), precision=1e-5, zeroThreshold=1e-5)
    store_pkl(pipeline, name)
    labels = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((labels, probabilities), axis=1), name)
def build_audit_na_hist(classifier, name):
    """Fit a histogram-capable "audit" classifier on data with missing values and export it."""
    cont_mappings = [([col], ContinuousDomain()) for col in ["Age", "Hours", "Income"]]
    cat_mappings = [([col], [CategoricalDomain(), PMMLLabelBinarizer()]) for col in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cont_mappings + cat_mappings)),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.verify(audit_na_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    labels = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((labels, probabilities), axis=1), name)
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
    """Fit a versicolor classifier on the two petal columns only and export it."""
    # Keep only the petal measurements; drop every other column.
    transformer = ColumnTransformer(
        [("all", "passthrough", ["Petal.Length", "Petal.Width"])],
        remainder="drop")
    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("classifier", classifier)
    ])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba == True:
        probabilities = DataFrame(pipeline.predict_proba(versicolor_X), columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, probabilities), axis=1)
    store_csv(species, name)
def build_audit_dict(classifier, name, with_proba=True):
    """Fit an "audit" classifier over dict-shaped rows, stamping a PMML header."""
    header = {
        "copyright": "Copyright (c) 2021 Villu Ruusmann",
        "description": "Integration test for dictionary (key-value mappings) input",
        "modelVersion": "1.0.0"
    }
    pipeline = PMMLPipeline([
        ("dict-transformer", DictVectorizer()),
        ("classifier", classifier)
    ], header=header)
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_dict_X), columns=["Adjusted"])
    if with_proba == True:
        probabilities = DataFrame(pipeline.predict_proba(audit_dict_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, probabilities), axis=1)
    store_csv(adjusted, name)
def build_sentiment(classifier, tokenizer, name, with_proba=True, **pmml_options):
    """Fit a sentiment classifier over TF-IDF plus word-count features and export it."""
    # Random forests train on float32 term weights, everything else on float64;
    # LogisticRegressionCV additionally uses sublinear TF scaling.
    tfidf = TfidfVectorizer(
        analyzer="word",
        preprocessor=None,
        strip_accents=None,
        lowercase=True,
        tokenizer=tokenizer,
        stop_words="english",
        ngram_range=(1, 2),
        norm=None,
        sublinear_tf=isinstance(classifier, LogisticRegressionCV),
        dtype=(numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64))
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("tf-idf", tfidf),
            ("count", WordCountTransformer())
        ])),
        ("selector", SelectKBest(f_classif, k=1000)),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba == True:
        probabilities = DataFrame(pipeline.predict_proba(sentiment_X), columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, probabilities), axis=1)
    store_csv(score, name)
def build_audit_h2o(classifier, name):
    """Fit an H2O "audit" classifier; export its MOJO, pickle and predictions."""
    mapper = DataFrameMapper(
        [([col], ContinuousDomain()) for col in ["Age", "Hours", "Income"]] +
        [([col], CategoricalDomain()) for col in ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("uploader", H2OFrameCreator()),
        ("classifier", classifier)
    ])
    # H2O expects the target as a categorical H2OFrame.
    pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types=["categorical"]))
    pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    classifier = pipeline._final_estimator
    store_mojo(classifier, name)
    store_pkl(pipeline, name)
    adjusted = pipeline.predict(audit_X)
    adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
    store_csv(adjusted.as_data_frame(), name)
def build_audit_na(classifier, name, with_proba=True, fit_params=None, predict_params=None, predict_proba_params=None, predict_transformer=None, predict_proba_transformer=None, apply_transformer=None, **pmml_options):
    """Fit, verify and export an "audit" classifier over data with missing values.

    Fix: fit_params/predict_params/predict_proba_params previously defaulted to
    shared mutable dicts (the mutable-default-argument pitfall); they now
    default to None.
    """
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params
    predict_proba_params = {} if predict_proba_params is None else predict_proba_params
    # Collapse detailed employment sectors into PRIVATE/PUBLIC.
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    # Encode gender numerically; imputed placeholder maps to the midpoint.
    gender_mapping = {
        "FEMALE": 0.0,
        "MALE": 1.0,
        "MISSING_VALUE": 0.5
    }
    mapper = DataFrameMapper(
        # Age: flag missing as -999, then impute the constant 38.
        [(["Age"], [ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype=int), name="flag_missing(Age, -999)"), SimpleImputer(missing_values=-999, strategy="constant", fill_value=38)])] +
        [(["Age"], MissingIndicator())] +
        [(["Hours"], [ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name="flag_missing(Hours, -999)"), SimpleImputer(missing_values=-999, add_indicator=True)])] +
        # Income: treat values outside [5000, 200000] as missing, then impute the median.
        [(["Income"], [ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values", low_value=5000, high_value=200000, with_data=False), SimpleImputer(strategy="median", add_indicator=True)])] +
        [(["Employment"], [CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(missing_values=None), StringNormalizer(function="uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function="lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values=None, missing_value_replacement="N/A", with_data=False), SimpleImputer(missing_values="N/A", strategy="most_frequent"), StringNormalizer(function="lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [CategoricalDomain(missing_values=None, with_data=False), SimpleImputer(strategy="constant"), StringNormalizer(function="uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_transformer=predict_transformer, predict_proba_transformer=predict_proba_transformer, apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y, **fit_params)
    pipeline.configure(**pmml_options)
    sample = audit_na_X.sample(frac=0.05, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # XGBoost computes in float32; relax the verification tolerances.
        pipeline.verify(sample, predict_params=predict_params, predict_proba_params=predict_proba_params, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample, predict_params=predict_params, predict_proba_params=predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Also record the tree leaf (node id) that each row lands in.
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name)
def build_audit_na(classifier, name, with_proba=True, predict_proba_transformer=None, apply_transformer=None, **pmml_options):
    """Fit and export an "audit" classifier over data with missing values.

    Fix: sklearn.preprocessing.Imputer was deprecated in 0.20 and removed in
    0.22; replaced with SimpleImputer, which has the same mean-imputation
    default, so behavior is unchanged.
    """
    # Collapse detailed employment sectors into PRIVATE/PUBLIC.
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {
        "FEMALE": 0,
        "MALE": 1
    }
    mapper = DataFrameMapper(
        # Age/Hours: flag missing as -999, then impute (mean by default).
        [(["Age"], [ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name="flag_missing(Age, -999)"), SimpleImputer(missing_values=-999)])] +
        [(["Hours"], [ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name="flag_missing(Hours, -999)"), SimpleImputer(missing_values=-999)])] +
        # Income: treat values outside [5000, 200000] as missing before imputation.
        [(["Income"], [ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values", low_value=5000, high_value=200000, with_data=False), SimpleImputer()])] +
        [(["Employment"], [CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(), StringNormalizer(function="uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function="lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(missing_values=None), StringNormalizer(function="lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(), StringNormalizer(function="uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_proba_transformer=predict_proba_transformer, apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Also record the tree leaf (node id) that each row lands in.
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
SimpleImputer(strategy="median")), ("scaler", StandardScaler())]) categorical_features = ["age", "gender", "category"] categorical_transformer = OneHotEncoder(handle_unknown="ignore") preprocessor = ColumnTransformer(transformers=[("numeric", numeric_transformer, numeric_features), ("categorical", categorical_transformer, categorical_features)]) lr = LogisticRegression(multi_class="ovr") lr.pmml_name_ = f"PMML-FraudDetection-{model_id}" classifier = PMMLPipeline([("preprocessor", preprocessor), ("classifier", lr)]) # Build model classifier.fit(train_x, train_y) # Evaluate model test_actual = test_y test_predicted = classifier.predict(test_x) (rmse, mae, r2) = evaluate_metrics(test_actual, test_predicted) print("FraudDetection model:") print(" RMSE: {}".format(rmse)) print(" MAE: {}".format(mae)) print(" R2: {}".format(r2)) sklearn2pmml(classifier, output_path, with_repr=True) print("Fraud detection model exported")