import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

# Train a linear regression for highway mpg on the mpg dataset and
# export it as PMML.
data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

# Build the per-column preprocessing steps explicitly rather than by
# concatenating comprehensions.
mappings = [(numeric_features, [ContinuousDomain()])]
for feature in categorical_features:
    mappings.append(([feature], [CategoricalDomain(), OneHotEncoder()]))
for feature in text_features:
    mappings.append((feature, [
        CategoricalDomain(),
        CountVectorizer(tokenizer=Splitter(), max_features=5)
    ]))
mapper = DataFrameMapper(mappings)

pipeline = PMMLPipeline([("mapper", mapper), ("model", LinearRegression())])
pipeline.fit(data, data["hwy"])
sklearn2pmml(pipeline, "test/support/python/linear_regression_text.pmml")
print(list(pipeline.predict(data[:10])))
species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]) species = pandas.concat((species, species_proba), axis = 1) store_csv(species, name) if "Iris" in datasets: build_iris_opt(LGBMClassifier(objective = "multiclass"), "LGBMIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "multi_logloss", "classifier__early_stopping_rounds" : 3}) build_iris_opt(XGBClassifier(objective = "multi:softprob"), "XGBIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "mlogloss", "classifier__early_stopping_rounds" : 3}) if "Iris" in datasets: pipeline = PMMLPipeline([ ("mapper", DataFrameMapper([ (iris_X.columns.values, ContinuousDomain()) ])), ("classifier", SelectFirstClassifier([ ("select", Pipeline([ ("classifier", DecisionTreeClassifier(random_state = 13)) ]), "X[1] <= 3"), ("default", Pipeline([ ("scaler", StandardScaler()), ("classifier", LogisticRegression(multi_class = "ovr", solver = "liblinear")) ]), str(True)) ])) ]) pipeline.fit(iris_X, iris_y) store_pkl(pipeline, "SelectFirstIris") species = DataFrame(pipeline.predict(iris_X), columns = ["Species"]) species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]) species = pandas.concat((species, species_proba), axis = 1) store_csv(species, "SelectFirstIris") if "Iris" in datasets: classifier = RuleSetClassifier([
build_iris_opt(LGBMClassifier(objective = "multiclass"), "LGBMIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "multi_logloss", "classifier__early_stopping_rounds" : 3}) build_iris_opt(XGBClassifier(objective = "multi:softprob"), "XGBIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "mlogloss", "classifier__early_stopping_rounds" : 3}) if "Iris" in datasets: mapper = DataFrameMapper([ (iris_X.columns.values, ContinuousDomain()) ]) iris_Xt = mapper.fit_transform(iris_X) dt_classifier = DecisionTreeClassifier(random_state = 13) dt_classifier.fit(iris_Xt, iris_y) lr_classifier = LogisticRegression(multi_class = "ovr", solver = "liblinear") lr_classifier.fit(iris_Xt, iris_y) pipeline = PMMLPipeline([ ("mapper", mapper), ("estimator", SelectFirstEstimator([ ("X[2] <= 3", dt_classifier), (str(True), lr_classifier) ])) ]) pipeline.active_fields = iris_X.columns.values pipeline.target_fields = ["Species"] store_pkl(pipeline, "SelectFirstIris") species = DataFrame(pipeline.predict(iris_X), columns = ["Species"]) store_csv(species, "SelectFirstIris") if "Iris" in datasets: classifier = RuleSetClassifier([ ("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"), ("X['Petal.Length'] >= 2.45", "virginica") ], default_score = "setosa") pipeline = PMMLPipeline([
labels=["q1", "q2", "q3", "q4"]), LabelBinarizer() ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income")) ]) interaction_mapper = DataFrameMapper([ ("Gender", [CategoricalDomain(), LabelBinarizer()]), ("Marital", [CategoricalDomain(), LabelBinarizer()]) ]) classifier = XGBClassifier() pipeline = PMMLPipeline([ ("mapper", FeatureUnion([("scalar_mapper", scalar_mapper), ("interaction", Pipeline([("interaction_mapper", interaction_mapper), ("polynomial", PolynomialFeatures())]))])), ("classifier", classifier) ]) pipeline.fit(audit_X, audit_y) pipeline.configure(compact=True) pipeline.verify(audit_X.sample(100), zeroThreshold=1e-6, precision=1e-6) sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml") if "--deploy" in sys.argv: from openscoring import Openscoring os = Openscoring("http://localhost:8080/openscoring") os.deployFile("XGBoostAudit", "pmml/XGBoostAudit.pmml")
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.ruleset import RuleSetClassifier
import pandas
import sys

# Train a rule-based Iris species classifier and export it as PMML.
iris_df = pandas.read_csv("csv/Iris.csv")
#print(iris_df.head(5))

# Features are every column except the target.
iris_X = iris_df[iris_df.columns.difference(["Species"])]
iris_y = iris_df["Species"]

# Ordered decision rules: the first matching predicate wins; anything
# left over falls through to the default score.
rules = [
    ("X['Petal_Length'] < 2.45", "setosa"),
    ("X['Petal_Width'] < 1.75", "versicolor"),
]
classifier = RuleSetClassifier(rules, default_score = "virginica")

pipeline = PMMLPipeline([
    ("classifier", classifier)
])
pipeline.fit(iris_X, iris_y)
sklearn2pmml(pipeline, "pmml/RuleSetIris.pmml")

# Optionally push the exported model to a local Openscoring server.
if "--deploy" in sys.argv:
    from openscoring import Openscoring
    os = Openscoring("http://localhost:8080/openscoring")
    os.deployFile("RuleSetIris", "pmml/RuleSetIris.pmml")
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

# Train a LightGBM drive-type ("drv") classifier on the mpg dataset and
# export it as PMML. Set `binary` to collapse the target to two classes.
binary = False
data = pd.read_csv("test/support/mpg.csv")
if binary:
    # Merge rear-wheel drive into four-wheel drive for a 2-class target.
    data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

# Build the column mappings step by step.
mappings = [(numeric_features, [ContinuousDomain()])]
for feature in categorical_features:
    mappings.append(([feature], [CategoricalDomain(), PMMLLabelEncoder()]))
for feature in text_features:
    mappings.append((feature, [CategoricalDomain(),
                               CountVectorizer(tokenizer=Splitter())]))
mapper = DataFrameMapper(mappings)

pipeline = PMMLPipeline([("mapper", mapper),
                         ("model", LGBMClassifier(n_estimators=1000))])
# Column 3 of the mapped matrix is the label-encoded "class" feature.
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

if binary:
    suffix = "binary"
else:
    suffix = "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")
print(list(pipeline.predict(data[:10])))
print(list(pipeline.predict_proba(data[0:1])[0]))
("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))]) classifier = H2ORandomForestEstimator(ntrees=17) predict_proba_transformer = Pipeline([ ("expression", ExpressionTransformer("X[1]")), ("cut", Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0], labels=["no", "maybe", "yes"]), "Decision", prefit=True)) ]) pipeline = PMMLPipeline([("local_mapper", mapper), ("uploader", H2OFrameCreator()), ("remote_classifier", classifier)], predict_proba_transformer=predict_proba_transformer) pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types=["categorical"])) pipeline.verify(audit_X.sample(100)) sklearn2pmml(pipeline, "pmml/RandomForestAudit.pmml") if "--deploy" in sys.argv: from openscoring import Openscoring os = Openscoring("http://localhost:8080/openscoring") os.deployFile("RandomForestAudit", "pmml/RandomForestAudit.pmml")
'''
@Author: Runsen
@WeChat public account: Runsen's notes (润森笔记)
@Blog: https://blog.csdn.net/weixin_44510615
@Date: 2020/5/24
'''
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

# Reduce Iris to three principal components, then classify with an SVM.
iris = load_iris()
pipeline = PMMLPipeline([
    ("pca", PCA(n_components=3)),
    ("classifier", SVC())
])
pipeline.fit(iris.data, iris.target)
# with_repr=True embeds the pipeline's repr() into the PMML document.
sklearn2pmml(pipeline, "iris_SVC.pmml", with_repr=True)
import pandas

iris_df = pandas.read_csv("Iris.csv")

from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
# FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the drop-in replacement (same default mean strategy).
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

# Preprocess the four continuous inputs (domain decoration + mean
# imputation), project to 3 principal components, keep the 2 best, then
# classify with logistic regression.
pipeline = PMMLPipeline([
    ("mapper", DataFrameMapper([
        (["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"],
         [ContinuousDomain(), SimpleImputer()])
    ])),
    ("pca", PCA(n_components=3)),
    ("selector", SelectKBest(k=2)),
    ("classifier", LogisticRegression())
])
pipeline.fit(iris_df, iris_df["Species"])

from sklearn2pmml import sklearn2pmml

sklearn2pmml(pipeline, "LogisticRegressionIris.pmml", with_repr=True)
class AutoBuilder:
    """E2E binary classifier builder.

    Builds a binary classifier, including:
    - dataset EDA (optional)
    - hyperparameter tuning (optional)
    - model performance assessment
    - SHAP-based feature analysis
    - feature selection
    - creating deployment package (pmml & pkl)

    Attributes:
        auto_build (method): automatically builds the classifier and
            populates the output_dir path with model artifacts and
            evaluation charts.
    """

    def __init__(
        self,
        output_dir_path,
        csv_path,
        target_col="target",
        ignore_cols=None,
        eda_flag=True,
        tune_flag=True,
        cardinality_threshold=100,
        shap_plot_num=10,
        shap_frac=0.05,
        importance_cutoff=0.00,
        corr_cutoff=0.9,
        search_space=LGB_SEARCH_SPACE,
        tuning_iters=25,
        lgb_params=None,
        random_state=1234,
    ):
        """
        Args:
            output_dir_path (string): filepath where outputs package is created and saved
            csv_path (string): filepath to input csv, NOTE need to preprocess columns to be numeric or string type
            target_col (string, optional): target column, default 'target'
            ignore_cols (iterable, optional): columns to be dropped, default []
            eda_flag (boolean, optional): EDA plots to be generated, default True
            tune_flag (boolean, optional): Lightgbm hyperparameters to be tuned, default True
            cardinality_threshold (numeric, optional): cardinality limit for categorical features, default 100
            shap_plot_num (numeric, optional): generate SHAP dependency plots for N most important features, default 10
            shap_frac (numeric, optional): proportion of data sampled for SHAP analysis, default 5%
            importance_cutoff (numeric, optional): abs. avg. SHAP value threshold to suggest dropping a feature, default 0.00
            corr_cutoff (numeric, optional): abs. correlation threshold to suggest dropping a feature, default 0.9
            search_space (optional): tuning space for Bayesian optimisation, default LGB_SEARCH_SPACE
            tuning_iters (numeric, optional): number of tuning iterations for Bayesian optimisation, default 25
            lgb_params (dict, optional): hyperparams to use when tune_flag = False, default {}
            random_state (numeric, optional): random seed for train test split and model training, default 1234
        """
        self.output_dir_path = output_dir_path
        self.csv_path = csv_path
        self.target_col = target_col
        # FIX: the original used mutable defaults ([] / {}), which are
        # shared across instances; create a fresh object per instance.
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        self.eda_flag = eda_flag
        self.tune_flag = tune_flag
        self.cardinality_threshold = cardinality_threshold
        self.shap_plot_num = shap_plot_num
        self.shap_frac = shap_frac
        self.importance_cutoff = importance_cutoff
        self.corr_cutoff = corr_cutoff
        self.search_space = search_space
        self.tuning_iters = tuning_iters
        self.lgb_params = {} if lgb_params is None else lgb_params
        self.random_state = random_state

    def _gen_model_dir(self):
        """Creates the output directory at self.output_dir_path (removing
        any previous output), plus /bin and /plots subdirectories."""
        # FIX: the original logged self.csv_path here, but the directory
        # being built is self.output_dir_path.
        logger.info(f"building directory {self.output_dir_path}")
        if os.path.exists(self.output_dir_path) and os.path.isdir(
                self.output_dir_path):
            shutil.rmtree(self.output_dir_path)
        os.mkdir(self.output_dir_path)
        os.mkdir(self.output_dir_path + "/bin")
        os.mkdir(self.output_dir_path + "/plots")

    def _process_csv(self):
        """Parses the csv at self.csv_path, saving to self.raw.

        Also:
        - drops ignore columns
        - validates target and feature columns
          (target must be binary 0/1; features numeric or string)
        """
        logger.info(f"loading file {self.csv_path}")
        raw = pd.read_csv(self.csv_path).drop(columns=self.ignore_cols)
        logger.info("checking valid input data")
        assert raw[self.target_col].isna().sum() == 0
        assert list(sorted(raw[self.target_col].unique())) == [0, 1]
        # Every column must be int64, float64 or object (string).
        valid_shape = raw.select_dtypes(
            include=["int64", "float64", "object"]).shape
        assert valid_shape == raw.shape
        self.raw = raw
        raw.to_csv(f"{self.output_dir_path}/bin/raw.csv")

    def _prepare_X_y(self):
        """Splits self.raw into X_train, y_train, X_test, y_test and saves
        a csv of the training set."""
        y = self.raw[self.target_col]
        X = self.raw.drop(columns=self.target_col)
        logger.info("train test split")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, random_state=self.random_state)
        data_train = self.X_train.copy()
        data_train["target"] = self.y_train
        training_data_path = f"{self.output_dir_path}/bin/train.csv"
        data_train.to_csv(training_data_path, index=False)
        del X, y

    def _create_categorical_transformer(self):
        """Builds the DataFrameMapper (label-encode string columns,
        continuous domain for numeric ones) and fits it on X_train."""
        self.categorical_cols = self.X_train.select_dtypes(
            include=["object"]).columns
        self.numeric_cols = self.X_train.select_dtypes(
            include=["int64", "float64"]).columns
        self.mapper = DataFrameMapper(
            [([cat_column], [CategoricalDomain(), LabelEncoder()])
             for cat_column in self.categorical_cols] +
            [(self.numeric_cols, ContinuousDomain())])
        # hacky, also storing separated X_train_encoded and classifier,
        # because couldn't get SHAP and skopt to work for e2e pipeline
        self.X_train_encoded = self.mapper.fit_transform(self.X_train)
        self.var_names = self.X_train.columns

    def _tune(self):
        """Explores the tuning space, updating self.lgb_params with values
        that minimize cross-validated brier score."""
        # todo, can I save memory, code and possibly tune binning strats
        # by passing unencoded X_train into pipeline?
        logger.info(f"tuning {self.tuning_iters}")
        results = utils.bayes_hyperparam_tune(
            model=lgb.LGBMClassifier(objective="binary"),
            X=self.X_train_encoded,
            y=self.y_train,
            search_space=self.search_space,
            n_iters=self.tuning_iters,
        )
        self.lgb_params = results.best_params_
        logger.info(f"best params {self.lgb_params}")

    def _save_model(self):
        """Saves the fitted sklearn pipeline as a pkl file.

        NOTE(review): the pmml export below is commented out, so only the
        pkl artifact is actually produced; re-enable once sklearn2pmml
        export works for this pipeline.
        """
        pmml_path = f"{self.output_dir_path}/model-pmml.pmml"
        pkl_path = f"{self.output_dir_path}/model-bin.pkl"
        pickle.dump(self.pipeline, open(pkl_path, "wb"))
        # sklearn2pmml(self.pipeline, pmml_path)

    def _generate_shap_plots(self):
        """Fits a standalone LGBM on the encoded training data and writes
        SHAP importance/dependency plots into the output directory."""
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        classifier.fit(self.X_train_encoded, self.y_train)
        X_shap = pd.DataFrame(data=self.X_train_encoded,
                              columns=self.var_names)
        self.feature_importance = utils.create_shap_plots(
            classifier,
            X_shap,
            output_dir=self.output_dir_path,
            N=self.shap_plot_num,
            frac=self.shap_frac,
        )

    def auto_build(self):
        """Populates output_dir path with model artifacts and evaluation
        charts."""
        self._gen_model_dir()
        self._process_csv()
        self._prepare_X_y()
        if self.eda_flag:
            logger.info("EDA")
            utils.dataset_eda(data=self.X_train,
                              output_dir=self.output_dir_path)
        self._create_categorical_transformer()
        if self.tune_flag:
            self._tune()
        self._generate_shap_plots()
        logger.info("creating pipeline")
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        self.pipeline = PMMLPipeline([("mapper", self.mapper),
                                      ("classifier", classifier)])
        self.pipeline.fit(self.X_train, self.y_train)
        logger.info("Assessing model")
        y_pred = self.pipeline.predict_proba(self.X_test)[:, 1]
        # Benchmark: predict the training base rate for every test row.
        y_bm = np.repeat(self.y_train.mean(), self.y_test.shape)
        utils.evaluate_model(self.y_test, y_pred, y_bm,
                             self.output_dir_path, "Model")
        # FIX: corrected "suggeting" typo in the log message.
        logger.info("suggesting features to remove")
        self.cols_to_remove = utils.find_features_to_remove(
            importance=self.feature_importance,
            X=self.X_train,
            importance_cutoff=self.importance_cutoff,
            corr_threshold=self.corr_cutoff,
        )
        logger.info(f"candidates to remove - {self.cols_to_remove}")
        logger.info(f"saving model \n{self.output_dir_path}")
        self._save_model()
        test_input = dict(self.X_test.iloc[0])
        test_score = self.pipeline.predict_proba(self.X_test.head(1))
        logger.info(
            f"test-case model inputs \n{ test_input } \n model score \n {test_score}"
        )
        logger.info("done!")
build_iris(VotingClassifier([("dt", DecisionTreeClassifier(random_state=13)), ("nb", GaussianNB()), ("lr", LogisticRegression())]), "VotingEnsembleIris", with_proba=False) build_iris(OptimalXGBClassifier(objective="multi:softprob", ntree_limit=7), "XGBIris", ntree_limit=7) if "Iris" in datasets: classifier = RuleSetClassifier( [("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"), ("X['Petal.Length'] >= 2.45", "virginica")], default_score="setosa") pipeline = PMMLPipeline([("classifier", classifier)]) pipeline.fit(iris_X, iris_y) pipeline.verify(iris_X.sample(frac=0.10, random_state=13)) store_pkl(pipeline, "RuleSetIris") species = DataFrame(pipeline.predict(iris_X), columns=["Species"]) store_csv(species, "RuleSetIris") # # Text classification # sentiment_X, sentiment_y = load_sentiment("Sentiment") def build_sentiment(classifier, name, with_proba=True, **pmml_options): pipeline = PMMLPipeline([
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

# Train a Gaussian naive Bayes drive-type ("drv") classifier on the mpg
# dataset and export it as PMML.
data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

# Build the column mappings step by step instead of concatenating
# comprehensions.
mappings = [(numeric_features, [ContinuousDomain()])]
for feature in categorical_features:
    mappings.append(([feature], [CategoricalDomain(), OneHotEncoder()]))
for feature in text_features:
    mappings.append((feature, [CategoricalDomain(),
                               CountVectorizer(tokenizer=Splitter())]))
mapper = DataFrameMapper(mappings)

pipeline = PMMLPipeline([("mapper", mapper), ("model", GaussianNB())])
pipeline.fit(data, data["drv"])
sklearn2pmml(pipeline, "test/support/python/naive_bayes.pmml")
print(pipeline.predict(data[:10]))
# raw_data=raw_data.sample(frac=0.03)
# Convert the non-numeric label column to numeric codes.
# print("Transforming data...")
raw_data[last_column_index], attacks = pd.factorize(
    raw_data[last_column_index], sort=True)

# Slice the raw frame into features and label: columns 1..41 are the
# features, column 42 is the label (iloc indexing is purely positional).
n_cols = raw_data.shape[1]
features = raw_data.iloc[:, :n_cols - 1]
labels = raw_data.iloc[:, n_cols - 1:]

# Optional standardisation (disabled):
# features = preprocessing.scale(features)
# features = pd.DataFrame(features)

# Flatten the (n, 1) label frame into a 1-d array.
labels = labels.values.ravel()

# Stratified 80/20 train/test split.
df = pd.DataFrame(features)
X_train, X_test, y_train, y_test = train_test_split(
    df, labels, train_size=0.8, test_size=0.2, stratify=labels)

pipeline = PMMLPipeline([
    ("classifier", DecisionTreeClassifier(criterion='entropy',
                                          max_depth=12,
                                          min_samples_leaf=1,
                                          splitter="best"))
])
pipeline.fit(X_train, y_train)
sklearn2pmml(pipeline, "data/pmml/DecisionTreeIris.pmml", with_repr=True)
# Column-wise preprocessing: one sub-pipeline for the categorical columns,
# another for the numeric ones.
feature_preprocessing = compose.ColumnTransformer(
    [('cat_feature_pipeline', cat_feature_pipeline, cat_features_list),
     ('num_feature_pipeline', num_feature_pipeline, num_features_list)],
    n_jobs=10)

# Two parallel feature extractors: PCA components explaining 90% of the
# variance, plus features surviving an extra-trees importance filter.
features_pipeline = pipeline.FeatureUnion(
    [('pca_selector', decomposition.PCA(n_components=0.90)),
     ('et_selector',
      feature_selection.SelectFromModel(ensemble.ExtraTreesClassifier()))],
    n_jobs=20)

classifier = naive_bayes.GaussianNB()

# build complete pipeline with feature selection and ml algorithms
complete_pipeline = PMMLPipeline([
    ('preprocess', feature_preprocessing),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('features', features_pipeline),
    ('tree', classifier)
])

# Empty grid: GridSearchCV is used purely for its cross-validated fit/score.
pipeline_grid = {}
grid_estimator = model_selection.GridSearchCV(complete_pipeline,
                                              pipeline_grid,
                                              scoring="accuracy",
                                              cv=5,
                                              verbose=10,
                                              n_jobs=20)
grid_estimator.fit(X_train, y_train)
print(grid_estimator.best_estimator_)
print(grid_estimator.best_params_)
print(grid_estimator.best_score_)
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
import pandas as pd
import numpy as np

# Assemble the Iris data as a single DataFrame with a target column.
data = load_iris()
feature_cols = data['feature_names']
df = pd.DataFrame(data=np.c_[data['data'], data['target']],
                  columns=data['feature_names'] + ['target'])
X, y = df[feature_cols], df["target"]

# Hold out 12% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.12, random_state=42)
print(X_test.head())
print(y_test.head())

# Fit a gradient-boosted classifier and export it as PMML.
pipeline = PMMLPipeline([("classifier", GradientBoostingClassifier())])
pipeline.fit(X_train, y_train)
sklearn2pmml(
    pipeline,
    "/Users/tcfartunc/Dev/workspaces/python/python_model_server/pmmls/GBIris.pmml",
    with_repr=True)
FILE_PREFIX, train_transaction)) identity_test_df = pd.read_csv("{}/{}".format(FILE_PREFIX, test_identity)) X_final = pd.read_csv("{}/{}".format(FILE_PREFIX, test_transaction)) print("===============finish file loading=================") Y_train = transaction_train_df_raw['isFraud'] X_train = transaction_train_df_raw.drop('isFraud', axis=1) # 506691 preprocessor = PreProcessor(transaction_train_df_raw, identity_train_df) X_train_after_processing = preprocessor.preprocess() model = RandomForestClassifier(n_estimators=100, random_state=0) #my_pipeline = Pipeline(steps=[('preprocessor', preprocessor_pipeline), ('model', model)]) # Preprocessing of training data, fit model # my_pipeline.fit(X_train, Y_train) #my_pipeline.fit(X_train, Y_train) from sklearn2pmml.pipeline import PMMLPipeline pipeline = PMMLPipeline([ #("preprocessing", dataFrameMapper), ("classifier", model) ]) pipeline.fit(X_train_after_processing, Y_train) from sklearn2pmml import sklearn2pmml sklearn2pmml(pipeline, "model.pmml", with_repr=True) # Preprocessing of validation data, get predictions #preds = my_pipeline.predict(X_test)
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

if __name__ == '__main__':
    output_path = "iris.pmml"

    # Load the classic Iris data: four features, three species targets.
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Fit a decision tree inside a PMML-exportable pipeline, then write
    # the PMML file.
    pipeline = PMMLPipeline([("classifier", tree.DecisionTreeClassifier())])
    pipeline.fit(X, y)
    sklearn2pmml(pipeline, output_path, with_repr=True)

    # Show the first two samples and their targets.
    for i in (0, 1):
        print(f'features: {X[i, :]}, target: {y[i]}')
def build_iris(classifier, name, **pmml_options):
    """Fit `classifier` on the Iris dataset, verify it, export PMML, and
    store reference predictions plus class probabilities."""
    cont_columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
    # One continuous-domain mapping per input column.
    cont_mappings = []
    for cont_column in cont_columns:
        cont_mappings.append(([cont_column], ContinuousDomain()))
    mapper = DataFrameMapper(cont_mappings)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    verify_sample = iris_X.sample(n = 3, random_state = 13)
    # XGBoost needs looser numeric tolerances during verification.
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(verify_sample, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(verify_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
    store_csv(pandas.concat((species, species_proba), axis = 1), name)
def build_audit_na(classifier, name, with_proba=True, **kwargs):
    """Fit `classifier` on the audit dataset with missing values, then store
    the pickled pipeline and its predictions.

    Args:
        classifier: scikit-learn classifier to train.
        name: basename for the stored .pkl / .csv artifacts.
        with_proba: when truthy, also store class probability columns.
        **kwargs: passed through to customize() after fitting.
    """
    # Collapse detailed employment codes into PRIVATE/PUBLIC buckets.
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper([(["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        # Re-encode missing Age as the sentinel -999, then impute it.
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
            name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
            name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Income"], [
        # Out-of-range incomes are treated as missing, then mean-imputed.
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        Imputer()
    ])] + [(["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])] + [([column], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(missing_values=None),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    # FIX: idiomatic truthiness test instead of "with_proba == True".
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Also record the leaf node id each row lands in.
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
def build_auto(regressor, name, **pmml_options):
    """Fit `regressor` on the auto-mpg dataset, verify it, export PMML, and
    store reference predictions as CSV."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    cat_mappings = []
    for cat_column in cat_columns:
        # LightGBM consumes label-encoded categoricals; other regressors
        # get a binarized (one-hot) encoding.
        if isinstance(regressor, LGBMRegressor):
            steps = [cat_domain(name), label_encoder(name)]
        else:
            steps = [cat_domain(name), label_binarizer(name)]
        cat_mappings.append(([cat_column], steps))
    cont_mappings = []
    for cont_column in cont_columns:
        cont_mappings.append(([cont_column], [cont_domain(name)]))
    mapper = DataFrameMapper(cat_mappings + cont_mappings)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    # LightGBM needs the categorical column indices; IsolationForest is
    # unsupervised and fits without a target.
    if isinstance(regressor, LGBMRegressor):
        pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
    elif isinstance(regressor, IsolationForest):
        pipeline.fit(auto_X)
    else:
        pipeline.fit(auto_X, auto_y)
    verify_sample = auto_X.sample(n = 3, random_state = 13)
    # XGBoost needs looser numeric tolerances during verification.
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(verify_sample, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(verify_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    if isinstance(regressor, IsolationForest):
        # Outlier detection: store the raw decision function plus a
        # lowercase "true"/"false" outlier flag.
        decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
        outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
        outlier['outlier'] = outlier['outlier'].apply(lambda x: str(bool(x == -1)).lower())
        store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
    else:
        mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
        store_csv(mpg, name)
iris = load_iris()
data = iris.data
target = iris.target

# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'
# NOTE: earlier commented-out experiments (a toy decision tree and the
# Pima diabetes csv) were removed here for readability.

# Hold out a third of the rows for testing, with a fixed seed.
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    data, target, test_size=test_size, random_state=seed)

# Fit logistic regression inside a PMML-exportable pipeline.
pipeline = PMMLPipeline([("classifier", LogisticRegression())])
pipeline.fit(X_train, Y_train)
sklearn2pmml(pipeline, "logit_result.pmml")
def build_audit(classifier, name, **pmml_options):
    """Fit `classifier` on the audit dataset, verify it, export PMML, and
    store predictions plus class probabilities."""
    is_lgbm = isinstance(classifier, LGBMClassifier)
    # LightGBM treats Age as categorical and wants label-encoded inputs;
    # other classifiers keep Age continuous and binarize string columns.
    if is_lgbm:
        cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender"]
        cont_columns = ["Income", "Hours"]
    else:
        cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
        cont_columns = ["Age", "Income", "Hours"]
    cat_mappings = []
    for cat_column in cat_columns:
        encoder = label_encoder(name) if is_lgbm else label_binarizer(name)
        cat_mappings.append(([cat_column], [cat_domain(name), encoder]))
    cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
    mapper = DataFrameMapper(cat_mappings + cont_mappings)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    if is_lgbm:
        pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
    elif isinstance(classifier, XGBClassifier):
        if name == "XGBoostAuditNA":
            # presumably needed because Age may contain NaN in the NA
            # variant of the dataset — confirm against the caller.
            audit_X["Age"] = audit_X["Age"].astype(float)
        pipeline.fit(audit_X, audit_y)
    else:
        pipeline.fit(audit_X, audit_y)
    verify_sample = audit_X.sample(n = 3, random_state = 13)
    # XGBoost needs looser numeric tolerances during verification.
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(verify_sample, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(verify_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
    store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
# Per-column preprocessing: decorate every column with a domain that passes
# invalid values through, then binarize categoricals / scale continuous.
cat_mappings = [(cat_column, [CategoricalDomain(invalid_value_treatment="as_is"),
                              LabelBinarizer()])
                for cat_column in cat_columns]
cont_mappings = [([cont_column], [ContinuousDomain(invalid_value_treatment="as_is"),
                                  StandardScaler()])
                 for cont_column in cont_columns]
mapper = DataFrameMapper(cat_mappings + cont_mappings)

selector = SelectKBest()
classifier = LogisticRegression(multi_class="ovr", penalty="elasticnet",
                                solver="saga", max_iter=1000)
pipeline = PMMLPipeline([("mapper", mapper),
                         ("selector", selector),
                         ("classifier", classifier)])

# Grid-search the feature count and the elastic-net mixing ratio.
param_grid = {
    "selector__k": [10, 20, 30],
    "classifier__l1_ratio": [0.7, 0.8, 0.9]
}
searcher = GridSearchCV(estimator=pipeline, param_grid=param_grid)
searcher.fit(df_X, df_y)
print(searcher.best_params_)

# Embed verification data into the winning pipeline before export.
best_pipeline = searcher.best_estimator_
best_pipeline.verify(df_X.sample(n=5))
sklearn2pmml(best_pipeline, "GridSearchAudit.pmml")
print('accuracy(准确率):', accuracy)

# Plot the confusion matrix of predictions vs. true labels.
matrix = sklearn.metrics.confusion_matrix(pred_data['y_test'],
                                          pred_data['y_predict'])
plt.matshow(matrix)
plt.colorbar()
plt.xlabel('predict type')
plt.ylabel('true type')
plt.show()

# Approach two: grid-search for the best model hyperparameters.
svc2 = sklearn.svm.SVC()
# Candidate values for each hyperparameter to try.
param = {"C": [0.2, 0.4, 0.8, 1, 1.2, 1.6, 2],
         "kernel": ['rbf', 'linear'],
         "gamma": ['auto', 0.01, 0.05, 0.2, 0.4, 0.5, 1, 1.6, 2]}
gscv = sklearn.model_selection.GridSearchCV(svc2, param_grid=param, cv=3)
gscv = gscv.fit(X_train, y_train)
print("best score: {}".format(gscv.best_score_))
print("best params: {}".format(gscv.best_params_))

# 4. Export the model to a PMML file for use by downstream Java systems
# (requires a JDK to be installed).
pipeline = PMMLPipeline([("classifier", svc1)])
pipeline.fit(X_train, y_train)
sklearn2pmml(pipeline, ".\\demo.pmml", with_repr=True)
if __name__ == '__main__':
    samples = get_jewellery_data()
    X = samples.drop(['id'], axis=1)
    Y = samples["id"]
    print("data done!")

    # TF-IDF encode the two free-text columns, then train an SVM on the
    # combined vectors.
    name_vectorizer = TfidfVectorizer(norm=None, analyzer="word",
                                      max_features=200, tokenizer=Splitter())
    description_vectorizer = TfidfVectorizer(norm=None, analyzer="word",
                                             max_features=600,
                                             tokenizer=Splitter())
    mapper = DataFrameMapper([
        ('name', name_vectorizer),
        ('description', description_vectorizer),
    ])
    pipeline = PMMLPipeline([
        ('mapper', mapper),
        ('model', SVC(max_iter=10000)),  # train on TF-IDF vectors w/ Linear SVM classifier
    ])
    print("model set done!")

    pipeline.fit(X, Y)
    print("model fit done!")
    sklearn2pmml(pipeline, "../model/pmml_for_jewellery_second.pmml", with_repr=True)
# Train an XGBoost classifier on the Iris dataset and export it to PMML.
import pandas  # fix: was missing, yet pandas.read_csv is used below
from xgboost import XGBClassifier  # fix: XGBClassifier lives in xgboost, not sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

iris_df = pandas.read_csv("iris.csv")
# Encode the string species labels as integer class ids.
iris_df['species'] = LabelEncoder().fit_transform(iris_df['species'].values)
# iris_df.columns: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

clf = XGBClassifier(
    silent=0,                   # 0 = print messages while boosting (legacy flag; newer xgboost uses `verbosity`)
    # nthread=4,                # CPU thread count; defaults to the maximum available
    learning_rate=0.3,          # shrinkage applied to each boosting step
    min_child_weight=1,         # minimum sum of instance weights (hessian) in a leaf;
                                # smaller values make the model easier to overfit
    max_depth=6,                # tree depth; larger -> easier to overfit
    gamma=0,                    # minimum loss reduction required to make a further split;
                                # larger -> more conservative (typical values: 0.1, 0.2)
    subsample=1,                # row subsample ratio of the training instances per tree
    max_delta_step=0,           # cap on each tree's weight estimate (0 = no cap)
    colsample_bytree=1,         # column subsample ratio per tree
    reg_lambda=1,               # L2 regularization on leaf weights; larger -> less overfitting
    objective='multi:softmax',  # multi-class classification objective
    n_estimators=100,           # number of boosted trees
    seed=1000)

pipeline = PMMLPipeline([("classifier", clf)])
# Fit on every column except the target.
pipeline.fit(iris_df[iris_df.columns.difference(["species"])], iris_df["species"])
sklearn2pmml(pipeline, "xgboost.pmml", with_repr=True)
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit, verify, persist and score a classifier on the missing-value ("NA") variant of the Audit dataset.

	Couples a DataFrameMapper (per-column missing-value flagging/imputation,
	outlier treatment, category recoding) with the supplied classifier, fits the
	pipeline on the module-level `audit_na_X` / `audit_na_y` data, verifies it
	on a fixed 5% sample, then persists the pipeline with `store_pkl` and the
	predictions with `store_csv` (names suggest pickle/CSV writers — defined
	elsewhere in this file).

	Parameters: `classifier` is the estimator placed at the end of the pipeline;
	`name` is the artifact base name; `with_proba` adds probability columns to
	the output; `fit_params` / `predict_params` / `predict_proba_params` are
	forwarded to the corresponding pipeline calls; the three `*_transformer`
	arguments are forwarded to the PMMLPipeline constructor; `**pmml_options`
	are applied via `pipeline.configure`.

	NOTE(review): the `{}` defaults are mutable default arguments; they are only
	read here (unpacked with `**`), so this is benign, but callers must not rely
	on them being fresh objects.
	"""
	# Collapse detailed employment codes into a PRIVATE/PUBLIC scheme.
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	# Numeric encoding of gender; missing values map to the midpoint 0.5.
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		# Age: re-express missing as the sentinel -999, then impute the constant 38.
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		# Age: additionally emit a standalone was-missing indicator column.
		[(["Age"], MissingIndicator())] +
		# Hours: same sentinel trick (expression inverted), mean-imputed with an indicator column.
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		# Income: values outside [5000, 200000] are treated as missing, then median-imputed with an indicator.
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		# Employment: impute, uppercase, collapse via employment_mapping (default "OTHER"), lowercase, one-hot.
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		# Education / Marital / Occupation: replace missing with "N/A", impute most frequent, lowercase, one-hot.
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		# Gender: constant-impute (presumably sklearn's default "missing_value" fill, which after
		# uppercasing matches the "MISSING_VALUE" -> 0.5 mapping entry — TODO confirm), then map to numbers.
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"),
		LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	# Apply exporter-specific options (e.g. compaction) to the fitted model.
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		# XGBoost predictions are verified with relaxed numeric tolerances.
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		# Append the per-class probabilities alongside the hard predictions.
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# For decision trees, also record the id of the leaf each row lands in.
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
# Train a decision tree on the Iris CSV and export it to PMML.
# Fix: `import pandas` appeared twice, and the remaining imports were
# scattered through the script — deduplicated and grouped at the top.
import pandas
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

iris_df = pandas.read_csv("Iris.csv")
pipeline = PMMLPipeline([("classifier", DecisionTreeClassifier())])
# Every column except the "Species" target is used as a feature.
pipeline.fit(iris_df[iris_df.columns.difference(["Species"])], iris_df["Species"])
sklearn2pmml(pipeline, "DecisionTreeIris.pmml", with_repr=True)
def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
	"""Fit, verify, persist and score a regressor on the Auto-MPG dataset.

	Builds a DataFrameMapper of engineered features, chains it with a
	SelectUnique step and the supplied regressor in a PMMLPipeline, fits it on
	the module-level `auto_X` / `auto_y` data, verifies it on a fixed 5% sample,
	then persists the pipeline with `store_pkl` and the "mpg" predictions with
	`store_csv` (names suggest pickle/CSV writers — defined elsewhere in this file).

	Parameters: `regressor` is the estimator placed at the end of the pipeline;
	`name` is the artifact base name; `fit_params` / `predict_params` are
	forwarded to fit/predict; `**pmml_options` are applied via
	`pipeline.configure`.

	NOTE(review): the `{}` defaults are mutable default arguments; they are only
	read here (unpacked with `**`), so this is benign, but callers must not rely
	on them being fresh objects.
	"""
	# Joint (cylinders, origin) recoding; pairs not listed fall back to "other".
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		# Map the (cylinders, origin) pair to a joint category, then one-hot it.
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		# Boolean flag: odd number of cylinders.
		(["cylinders"], Alias(ExpressionTransformer("X[0] % 2.0 > 0.0"), name = "odd(cylinders)", prefit = True)),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		# Concatenate model_year and origin into "year/origin" strings, one-hot
		# them, and keep only the columns a small random forest deems important.
		(["model_year", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), ConcatTransformer("/"), LabelBinarizer(), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		# Standard-scale the four raw continuous measurements.
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		# Derived ratio feature, shifted by 0.5.
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	# Apply exporter-specific options (e.g. compaction) to the fitted model.
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		# XGBoost predictions are verified with relaxed numeric tolerances.
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
# Train a decision tree on the bundled Iris dataset and export it to PMML.
# Fix: `import pandas as pd` appeared twice; the duplicate and the
# commented-out toy X/y sample data were removed, and imports were grouped.
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

iris = load_iris()
data = iris.data
target = iris.target

# NOTE(review): the original script carried a (commented-out) PATH tweak for a
# JDK bin directory — presumably sklearn2pmml needs Java on PATH; enable a line
# like the following if the export step fails to find it:
# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'

pipeline = PMMLPipeline([("classifier", tree.DecisionTreeClassifier(random_state=9))])
pipeline.fit(data, target)
sklearn2pmml(pipeline, "tree_result.pmml")