"""Train a PCA + SVC pipeline on the Iris dataset and export it as PMML.

@Author: Runsen (original); see https://blog.csdn.net/weixin_44510615
"""
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

# Reduce the four iris measurements to three principal components
# before handing them to the SVM classifier.
pipeline = PMMLPipeline([
    ("pca", PCA(n_components=3)),
    ("classifier", SVC()),
])

iris = load_iris()
pipeline.fit(iris.data, iris.target)

# with_repr=True embeds the pipeline's Python repr into the PMML document.
sklearn2pmml(pipeline, "iris_SVC.pmml", with_repr=True)
def build_audit_na(classifier, name, with_proba=True, predict_transformer=None, predict_proba_transformer=None, apply_transformer=None, **pmml_options):
    """Fit `classifier` on the audit-NA dataset and persist its artifacts.

    The DataFrameMapper demonstrates several missing-value strategies:
    sentinel flag-and-impute for numerics, range-based outlier treatment
    for Income, and impute/normalize/binarize chains for categoricals.

    Args:
        classifier: scikit-learn classifier to fit.
        name: basename passed to store_pkl / store_csv.
        with_proba: when truthy, also store predict_proba columns.
        predict_transformer: optional post-processor for predictions.
        predict_proba_transformer: optional post-processor for probabilities.
        apply_transformer: optional post-processor for tree node ids.
        **pmml_options: forwarded to pipeline.configure().
    """
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0.0, "MALE": 1.0, "MISSING_VALUE": 0.5}
    mapper = DataFrameMapper(
        [(["Age"], [
            ContinuousDomain(missing_values=None, with_data=False),
            # Replace missing Age with the -999 sentinel, then impute it back.
            Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"),
                  name="flag_missing(Age, -999)"),
            Imputer(missing_values=-999)
        ])] +
        [(["Age"], MissingIndicator())] +
        [(["Hours"], [
            ContinuousDomain(missing_values=None, with_data=False),
            Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"),
                  name="flag_missing(Hours, -999)"),
            SimpleImputer(missing_values=-999, add_indicator=True)
        ])] +
        [(["Income"], [
            # Incomes outside [5000, 200000] are treated as missing,
            # then median-imputed.
            ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values",
                             low_value=5000, high_value=200000, with_data=False),
            SimpleImputer(strategy="median", add_indicator=True)
        ])] +
        [(["Employment"], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(missing_values=None),
            StringNormalizer(function="uppercase"),
            LookupTransformer(employment_mapping, "OTHER"),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ])] +
        [([column], [
            CategoricalDomain(missing_values=None, missing_value_replacement="N/A", with_data=False),
            SimpleImputer(missing_values="N/A", strategy="most_frequent"),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [
            CategoricalDomain(missing_values=None, with_data=False),
            SimpleImputer(strategy="constant"),
            StringNormalizer(function="uppercase"),
            LookupTransformer(gender_mapping, None)
        ])])
    pipeline = PMMLPipeline(
        [("mapper", mapper), ("classifier", classifier)],
        predict_transformer=predict_transformer,
        predict_proba_transformer=predict_proba_transformer,
        apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    # Fix: `== True` replaced by an idiomatic truthiness test.
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Also record the tree leaf (node id) reached by every sample.
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name)
('mapper', DataFrameMapper([('name', TfidfVectorizer(norm=None, analyzer="word", max_features=500, tokenizer=Splitter())), ('description', TfidfVectorizer(norm=None, analyzer="word", max_features=1000, tokenizer=Splitter()))])), ('model', SVC(max_iter=10000)), # train on TF-IDF vectors w/ Linear SVM classifier ]) print("model set done!") pipeline.fit(X, Y) print("model fit done!") c = pd.read_csv(r"C:\Users\钟顺民\Desktop\handbags.csv", sep=',', encoding='ISO-8859-1').dropna().sample(n=200) prediction = pipeline.predict(c.drop(['id'], axis=1)) t = c['id'] print("Accuracy Score ->", accuracy_score(prediction, t) * 100) """ Accuracy Score -> 98.0 """ # print(accuracy_score(prediction, Test_Y) * 100) # Test_X.to_csv(r"C:\Users\钟顺民\Desktop\test.csv") # df.to_csv(r"C:\Users\钟顺民\Desktop\3.csv")
class AutoBuilder:
    """End-to-end binary classifier builder.

    Builds a LightGBM binary classifier, including:
    - dataset EDA (optional)
    - hyperparameter tuning (optional)
    - model performance assessment
    - SHAP-based feature analysis
    - feature selection
    - creating deployment package (pmml & pkl)

    Attributes:
        auto_build (method): automatically builds the model and populates
            output_dir_path with model artifacts and evaluation charts
    """

    def __init__(
        self,
        output_dir_path,
        csv_path,
        target_col="target",
        ignore_cols=None,
        eda_flag=True,
        tune_flag=True,
        cardinality_threshold=100,
        shap_plot_num=10,
        shap_frac=0.05,
        importance_cutoff=0.00,
        corr_cutoff=0.9,
        search_space=LGB_SEARCH_SPACE,
        tuning_iters=25,
        lgb_params=None,
        random_state=1234,
    ):
        """
        Args:
            output_dir_path (string): filepath where outputs package is created and saved
            csv_path (string): filepath to input csv, NOTE need to preprocess columns to be numeric or string type
            target_col (string, optional): target column, default 'target'
            ignore_cols (iterable, optional): columns to be dropped, default None (drop nothing)
            eda_flag (boolean, optional): EDA plots to be generated, default True
            tune_flag (boolean, optional): Lightgbm hyperparameters to be tuned, default True
            cardinality_threshold (numeric, optional): categorical cardinality threshold, default 100
            shap_plot_num (numeric, optional): generate SHAP dependency plots for N most important features, default 10
            shap_frac (numeric, optional): proportion of data sampled for SHAP analysis, default 5%
            importance_cutoff (numeric, optional): abs. avg. SHAP value threshold to suggest dropping a feature, default 0.00
            corr_cutoff (numeric, optional): abs. correlation threshold to suggest dropping a feature, default 0.9
            search_space (optional): tuning space for Bayesian optimisation, default LGB_SEARCH_SPACE
            tuning_iters (numeric, optional): number of tuning iterations for Bayesian optimisation, default 25
            lgb_params (dict, optional): hyperparams to use when tune_flag = False, default None (empty dict)
            random_state (numeric, optional): random seed for train test split and model training, default 1234
        """
        self.output_dir_path = output_dir_path
        self.csv_path = csv_path
        self.target_col = target_col
        # Fix: mutable default arguments ([] / {}) are shared across all
        # calls; use None sentinels and build fresh containers here.
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        self.eda_flag = eda_flag
        self.tune_flag = tune_flag
        self.cardinality_threshold = cardinality_threshold
        self.shap_plot_num = shap_plot_num
        self.shap_frac = shap_frac
        self.importance_cutoff = importance_cutoff
        self.corr_cutoff = corr_cutoff
        self.search_space = search_space
        self.tuning_iters = tuning_iters
        self.lgb_params = {} if lgb_params is None else lgb_params
        self.random_state = random_state

    def _gen_model_dir(self):
        """Create the output directory per self.output_dir_path (removing
        any previous output), plus /bin and /plots subdirectories."""
        # Fix: log the directory being rebuilt, not the csv path.
        logger.info(f"building directory {self.output_dir_path}")
        if os.path.exists(self.output_dir_path) and os.path.isdir(
                self.output_dir_path):
            shutil.rmtree(self.output_dir_path)
        os.mkdir(self.output_dir_path)
        os.mkdir(self.output_dir_path + "/bin")
        os.mkdir(self.output_dir_path + "/plots")

    def _process_csv(self):
        """Parse the csv at self.csv_path into self.raw.

        Also:
        - drops ignore columns
        - validates target and feature columns
          (target = binary 0-1; features = numeric or string)
        """
        logger.info(f"loading file {self.csv_path}")
        raw = pd.read_csv(self.csv_path).drop(columns=self.ignore_cols)
        logger.info("checking valid input data")
        assert raw[self.target_col].isna().sum() == 0
        assert list(sorted(raw[self.target_col].unique())) == [0, 1]
        # Every column must be int64/float64/object, otherwise shapes differ.
        valid_shape = raw.select_dtypes(
            include=["int64", "float64", "object"]).shape
        assert valid_shape == raw.shape
        self.raw = raw
        raw.to_csv(f"{self.output_dir_path}/bin/raw.csv")

    def _prepare_X_y(self):
        """Split self.raw into X_train/y_train and X_test/y_test (80/20),
        and save a csv of the training set to /bin/train.csv."""
        y = self.raw[self.target_col]
        X = self.raw.drop(columns=self.target_col)
        logger.info("train test split")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, random_state=self.random_state)
        data_train = self.X_train.copy()
        data_train["target"] = self.y_train
        training_data_path = f"{self.output_dir_path}/bin/train.csv"
        data_train.to_csv(training_data_path, index=False)
        del X, y

    def _create_categorical_transformer(self):
        """Build and fit the DataFrameMapper: label-encode categorical
        columns, pass numeric columns through a ContinuousDomain."""
        self.categorical_cols = self.X_train.select_dtypes(
            include=["object"]).columns
        self.numeric_cols = self.X_train.select_dtypes(
            include=["int64", "float64"]).columns
        self.mapper = DataFrameMapper(
            [([cat_column], [CategoricalDomain(), LabelEncoder()])
             for cat_column in self.categorical_cols] +
            [(self.numeric_cols, ContinuousDomain())])
        # hacky, also storing separated X_train_encoded and classifier,
        # because couldn't get SHAP and skopt to work for e2e pipeline
        self.X_train_encoded = self.mapper.fit_transform(self.X_train)
        self.var_names = self.X_train.columns

    def _tune(self):
        """Explore the tuning space, updating self.lgb_params with the
        values that minimize cross-validated brier score."""
        # todo: could save memory/code and possibly tune binning strategies
        # by passing unencoded X_train into the pipeline
        logger.info(f"tuning {self.tuning_iters}")
        results = utils.bayes_hyperparam_tune(
            model=lgb.LGBMClassifier(objective="binary"),
            X=self.X_train_encoded,
            y=self.y_train,
            search_space=self.search_space,
            n_iters=self.tuning_iters,
        )
        self.lgb_params = results.best_params_
        logger.info(f"best params {self.lgb_params}")

    def _save_model(self):
        """Save the fitted sklearn pipeline as a pkl file (PMML export is
        currently disabled) under self.output_dir_path."""
        pmml_path = f"{self.output_dir_path}/model-pmml.pmml"
        pkl_path = f"{self.output_dir_path}/model-bin.pkl"
        # Fix: use a context manager so the file handle is always closed
        # (the original pickle.dump(..., open(...)) leaked the handle).
        with open(pkl_path, "wb") as pkl_file:
            pickle.dump(self.pipeline, pkl_file)
        # sklearn2pmml(self.pipeline, pmml_path)

    def _generate_shap_plots(self):
        """Fit a standalone LGBM on the encoded training data and create
        SHAP plots, storing importances in self.feature_importance."""
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        classifier.fit(self.X_train_encoded, self.y_train)
        X_shap = pd.DataFrame(data=self.X_train_encoded, columns=self.var_names)
        self.feature_importance = utils.create_shap_plots(
            classifier,
            X_shap,
            output_dir=self.output_dir_path,
            N=self.shap_plot_num,
            frac=self.shap_frac,
        )

    def auto_build(self):
        """Populate output_dir_path with model artifacts and evaluation charts."""
        self._gen_model_dir()
        self._process_csv()
        self._prepare_X_y()
        if self.eda_flag:
            logger.info("EDA")
            utils.dataset_eda(data=self.X_train, output_dir=self.output_dir_path)
        self._create_categorical_transformer()
        if self.tune_flag:
            self._tune()
        self._generate_shap_plots()
        logger.info("creating pipeline")
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        self.pipeline = PMMLPipeline([("mapper", self.mapper),
                                      ("classifier", classifier)])
        self.pipeline.fit(self.X_train, self.y_train)
        logger.info("Assessing model")
        y_pred = self.pipeline.predict_proba(self.X_test)[:, 1]
        # Benchmark: constant prediction at the training base rate.
        y_bm = np.repeat(self.y_train.mean(), self.y_test.shape)
        utils.evaluate_model(self.y_test, y_pred, y_bm, self.output_dir_path,
                             "Model")
        logger.info("suggesting features to remove")
        self.cols_to_remove = utils.find_features_to_remove(
            importance=self.feature_importance,
            X=self.X_train,
            importance_cutoff=self.importance_cutoff,
            corr_threshold=self.corr_cutoff,
        )
        logger.info(f"candidates to remove - {self.cols_to_remove}")
        logger.info(f"saving model \n{self.output_dir_path}")
        self._save_model()
        test_input = dict(self.X_test.iloc[0])
        test_score = self.pipeline.predict_proba(self.X_test.head(1))
        logger.info(
            f"test-case model inputs \n{ test_input } \n model score \n {test_score}"
        )
        logger.info("done!")
# NOTE(review): this chunk starts mid-call — the enclosing
# build_iris(VotingClassifier([...])) expression opens before the
# visible region.
DecisionTreeClassifier(random_state=13)), ("nb", GaussianNB()),
  ("lr", LogisticRegression())]), "VotingEnsembleIris", with_proba=False)
build_iris(OptimalXGBClassifier(objective="multi:softprob", ntree_limit=7), "XGBIris", ntree_limit=7)

if "Iris" in datasets:
    # Hand-written rule set: rules are evaluated in order, first match
    # wins; anything unmatched falls through to the default score.
    classifier = RuleSetClassifier(
        [("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"),
         ("X['Petal.Length'] >= 2.45", "virginica")],
        default_score="setosa")
    pipeline = PMMLPipeline([("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    # Embed a 10% verification sample into the PMML for round-trip checks.
    pipeline.verify(iris_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, "RuleSetIris")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    store_csv(species, "RuleSetIris")

#
# Text classification
#

sentiment_X, sentiment_y = load_sentiment("Sentiment")

def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    # NOTE(review): the chunk ends mid-statement — the PMMLPipeline step
    # list continues past the visible region.
    pipeline = PMMLPipeline([
        ("tf-idf",
class url(object):
    """Malicious-URL detector: TF-IDF over character 3-grams fed into a
    logistic regression, trained at construction time and exported to PMML."""

    def __init__(self):
        # Read the data
        good_query_list = self.get_query_list('goodqueries.txt')
        bad_query_list = self.get_query_list('badqueries.txt')
        # Label the benign (0) and malicious (1) data
        good_y = [0 for i in range(0, len(good_query_list))]
        bad_y = [1 for i in range(0, len(bad_query_list))]
        queries = good_query_list + bad_query_list
        y = good_y + bad_y
        # Convert the raw text into vectors
        self.vectorizer = TfidfVectorizer(tokenizer=self.get_ngrams)
        # Turn the text strings into a ([i,j], tf-idf value) matrix X
        X = self.vectorizer.fit_transform(queries)
        # Split into training data (model building) and test data (accuracy check).
        # NOTE(review): test_size=20 is an absolute count of 20 samples,
        # not 20% — confirm this is intended rather than test_size=0.2.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20, random_state=42)
        # Define the model (logistic regression)
        self.lgs = PMMLPipeline([('LogisticModer', LogisticRegression(solver='liblinear'))])
        # Train the model
        self.lgs.fit(X_train, y_train)
        # Report model accuracy (message text is user-facing; left as-is)
        print('模型准确度:{}'.format(self.lgs.score(X_test, y_test)))
        sklearn2pmml(self.lgs, '.\lgs.pmml', with_repr=True)

    # Read the list of requests from the training text file
    def get_query_list(self, filename):
        directory = str(os.getcwd()) + '\\data\\train'
        filepath = directory + '\\' + filename
        data = open(filepath, 'r', encoding='utf-8').readlines()
        query_list = []
        for d in data:
            # URL-decode each line before use
            d = str(urllib.parse.unquote(d))
            query_list.append(d)
        # Deduplicate (set order is not deterministic)
        return list(set(query_list))

    # Read the list of requests from the test text file
    def test_query_list(self, filename):
        directory = str(os.getcwd()) + '\\data\\test'
        filepath = directory + '\\' + filename
        data = open(filepath, 'r', encoding='utf-8').readlines()
        query_list = []
        for d in data:
            d = str(urllib.parse.unquote(d))
            query_list.append(d)
        return list(set(query_list))

    # Split a string into overlapping 3-character grams
    def get_ngrams(self, query):
        tempQuery = str(query)
        ngrams = []
        # NOTE(review): range stops at len-3, so the final trigram
        # tempQuery[len-3:len] is never produced — looks like an
        # off-by-one (range(len-2) would emit it); confirm before changing,
        # since fixing it alters the trained model's feature space.
        for i in range(0, len(tempQuery) - 3):
            ngrams.append(tempQuery[i:i + 3])
        return ngrams

    # Predict new URLs
    def predict(self, newQueries):
        # NOTE(review): the comprehension variable `url` shadows this
        # class's name inside the method body.
        newQueries = [urllib.parse.unquote(url) for url in newQueries]
        X_predict = self.vectorizer.transform(newQueries)
        res = self.lgs.predict(X_predict)
        res_list = []
        for q, r in zip(newQueries, res):
            # 0 -> normal request, 1 -> malicious request (labels user-facing)
            tmp = '正常请求' if r == 0 else '恶意请求'
            # HTML-escape the query before echoing it back
            q_entity = html.escape(q)
            res_list.append({'url': q_entity, 'res': tmp})
        print("预测的结果列表:{}".format(str(res_list)))
        return res_list
"""Fit Gaussian Naive Bayes on the mpg dataset and export it as PMML."""
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

# Assemble the per-column preprocessing spec for the DataFrameMapper:
# numeric columns share one ContinuousDomain; each categorical column is
# one-hot encoded; each text column (none here) gets a bag-of-words.
columns = [(numeric_features, [ContinuousDomain()])]
for feature in categorical_features:
    columns.append(([feature], [CategoricalDomain(), OneHotEncoder()]))
for feature in text_features:
    columns.append((feature, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter())]))
mapper = DataFrameMapper(columns)

pipeline = PMMLPipeline([("mapper", mapper), ("model", GaussianNB())])
pipeline.fit(data, data["drv"])

sklearn2pmml(pipeline, "test/support/python/naive_bayes.pmml")
print(pipeline.predict(data[:10]))
"""Linear regression with text features on the mpg dataset, exported to PMML."""
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

# Column-wise preprocessing: PMML domains for all columns, one-hot for
# categoricals, and a 5-token bag-of-words for the free-text column.
columns = [(numeric_features, [ContinuousDomain()])]
for feature in categorical_features:
    columns.append(([feature], [CategoricalDomain(), OneHotEncoder()]))
for feature in text_features:
    columns.append((feature, [
        CategoricalDomain(),
        CountVectorizer(tokenizer=Splitter(), max_features=5)
    ]))
mapper = DataFrameMapper(columns)

pipeline = PMMLPipeline([("mapper", mapper), ("model", LinearRegression())])
pipeline.fit(data, data["hwy"])

sklearn2pmml(pipeline, "test/support/python/linear_regression_text.pmml")
print(list(pipeline.predict(data[:10])))
def build_audit_na(classifier, name, with_proba=True, **kwargs):
    """Fit `classifier` on the audit-NA dataset and persist its artifacts.

    Variant using sentinel flag-and-impute for numerics, range-based
    outlier treatment for Income, and impute/normalize/binarize chains
    for categoricals.

    Args:
        classifier: scikit-learn classifier to fit.
        name: basename for the "<name>.pkl" / "<name>.csv" artifacts.
        with_proba: when truthy, also store predict_proba columns.
        **kwargs: forwarded to customize() after fitting.
    """
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper(
        [(["Age"], [
            ContinuousDomain(missing_values=None, with_data=False),
            # Replace missing Age with the -999 sentinel, then impute it back.
            Alias(ExpressionTransformer(
                "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
                name="flag_missing(Age, -999)"),
            Imputer(missing_values=-999)
        ])] +
        [(["Hours"], [
            ContinuousDomain(missing_values=None, with_data=False),
            Alias(ExpressionTransformer(
                "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
                name="flag_missing(Hours, -999)"),
            Imputer(missing_values=-999)
        ])] +
        [(["Income"], [
            # Incomes outside [5000, 200000] are treated as missing.
            ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values",
                             low_value=5000, high_value=200000, with_data=False),
            Imputer()
        ])] +
        [(["Employment"], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(),
            StringNormalizer(function="uppercase"),
            LookupTransformer(employment_mapping, "OTHER"),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ])] +
        [([column], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(missing_values=None),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(),
            StringNormalizer(function="uppercase"),
            LookupTransformer(gender_mapping, None)
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    # Fix: `== True` replaced by an idiomatic truthiness test.
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Also record the tree leaf (node id) reached by every sample.
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
# Fetch the wine-quality dataset; bail out early when it is unreachable.
try:
    data = pd.read_csv(csv_url, sep=";")
except Exception as e:
    logger.exception("Unable to download training & test CSV, "
                     f"check your internet connection. Error: {e}")
    exit(1)

# Random train/test split (library default proportions).
train, test = train_test_split(data)
train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
train_y = train[["quality"]]
test_y = test[["quality"]]

# Elastic-net regressor wrapped in a PMML pipeline so it can be exported;
# pmml_name_ tags the model element inside the generated document.
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
lr.pmml_name_ = f"PMML-ElasticnetWineModel-{model_id}"
pipeline = PMMLPipeline(steps=[("elastic_net", lr)])
pipeline.fit(train_x, train_y)

# Score the held-out split.
predicted_qualities = pipeline.predict(test_x)
rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)
print(f"Elasticnet model (alpha={alpha}, l1_ratio={l1_ratio}):")
print(f" RMSE: {rmse}")
print(f" MAE: {mae}")
print(f" R2: {r2}")

sklearn2pmml(pipeline, output_path, with_repr=True)
print(f"Elasticnet model (alpha={alpha}, l1_ratio={l1_ratio}) exported")
"""Fit a decision tree on the Iris dataset and export it as PMML."""
# Fix: `import pandas as pd` was duplicated; the second copy was removed.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.datasets import load_iris
import os

iris = load_iris()
data = iris.data
target = iris.target

# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'
# X=[[1,2,3,1],[2,4,1,5],[7,8,3,6],[4,8,4,7],[2,5,6,9]]
# y=[0,1,0,2,1]

pipeline = PMMLPipeline([("classifier", tree.DecisionTreeClassifier(random_state=9))])
pipeline.fit(data, target)
sklearn2pmml(pipeline, "tree_result.pmml")
labels = labels.values.ravel() # 将数据分为训练集和测试集,并打印维数 df = pd.DataFrame(features) X_train, X_test, y_train, y_test = train_test_split(df, labels, train_size=0.8, test_size=0.2) # n_estimators:森林中树的个数;n_jobs:并行作业的数量,为-1时是处理器的核数; random_state:随机种子 print("Training model...") clf = RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=100) trained_model = clf.fit(X_train, y_train) print("Score:", trained_model.score(X_train, y_train)) model = PMMLPipeline([('RandomForest', RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=100))]) model.fit(X_train,y_train) sklearn2pmml(model, './save_model/RandomForest.pmml', with_repr=True) # predicting print("Predicting...") y_pred = clf.predict(X_test) print("Computing performance metrics...") results = confusion_matrix(y_test, y_pred) error = zero_one_loss(y_test, y_pred) # 根据混淆矩阵求预测精度 list_diag = np.diag(results) list_raw_sum = np.sum(results, axis=1) print("Predict accuracy: ", np.mean(list_diag) / np.mean(list_raw_sum))
"""Toy example: fit a decision tree on 5 samples, export PMML and a pickle."""
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
import os
# Fix: `from sklearn.externals import joblib` was removed in
# scikit-learn 0.23; import the standalone joblib package instead
# (it is a scikit-learn dependency, so it is already installed).
import joblib

# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'

X = [[1, 2, 3, 1], [2, 4, 1, 5], [7, 8, 3, 6], [4, 8, 4, 7], [2, 5, 6, 9]]
y = [0, 1, 0, 2, 1]

pipeline = PMMLPipeline([("classifier", tree.DecisionTreeClassifier(random_state=9))])
pipeline.fit(X, y)

# pip install --user --upgrade git+https://github.com/jpmml/sklearn2pmml.git
sklearn2pmml(pipeline, "demo.pmml", with_repr=True)

joblib.dump(pipeline, "pipeline.pkl.z", compress=9)
# java -jar target/jpmml-sklearn-executable-1.5-SNAPSHOT.jar --pkl-input pipeline.pkl.z --pmml-output pipeline.pmml
"""LightGBM regression on the mpg dataset, exported to PMML."""
# Fix: `pd` and `CountVectorizer` are used below but were never imported
# in this script's import block.
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

# Label-encode categoricals (LightGBM consumes integer codes directly)
# and build a capped bag-of-words for the free-text column.
mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
    [(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter(), max_features=5)]) for f in text_features]
)

pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("model", LGBMRegressor(n_estimators=1000))
])
# use model__sample_weight for weight; columns 3-4 of the mapped matrix
# are the label-encoded categoricals.
pipeline.fit(data, data["hwy"], model__categorical_feature=[3, 4])

sklearn2pmml(pipeline, "test/support/python/lightgbm_regression.pmml")
print(pipeline.predict(data[:10]))
"""Logistic regression on an Iris csv with imputation + PCA + feature
selection, exported to PMML."""
import pandas

iris_df = pandas.read_csv("Iris.csv")

from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
# Fix: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer (mean strategy by default, same as the old Imputer)
# is its drop-in replacement.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

pipeline = PMMLPipeline([
    ("mapper", DataFrameMapper([
        (["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"],
         [ContinuousDomain(), SimpleImputer()])
    ])),
    ("pca", PCA(n_components=3)),
    ("selector", SelectKBest(k=2)),
    ("classifier", LogisticRegression())
])
pipeline.fit(iris_df, iris_df["Species"])

from sklearn2pmml import sklearn2pmml
sklearn2pmml(pipeline, "LogisticRegressionIris.pmml", with_repr=True)
# raw_data=raw_data.sample(frac=0.03) # 将非数值型的数据转换为数值型数据 # print("Transforming data...") raw_data[last_column_index], attacks = pd.factorize( raw_data[last_column_index], sort=True) # 对原始数据进行切片,分离出特征和标签,第1~41列是特征,第42列是标签 features = raw_data.iloc[:, :raw_data.shape[1] - 1] # pandas中的iloc切片是完全基于位置的索引 labels = raw_data.iloc[:, raw_data.shape[1] - 1:] # 数据标准化 # features = preprocessing.scale(features) # features = pd.DataFrame(features) # 将多维的标签转为一维的数组 labels = labels.values.ravel() # 将数据分为训练集和测试集,并打印维数 df = pd.DataFrame(features) X_train, X_test, y_train, y_test = train_test_split(df, labels, train_size=0.8, test_size=0.2, stratify=labels) pipeline = PMMLPipeline([("classifier", DecisionTreeClassifier(criterion='entropy', max_depth=12, min_samples_leaf=1, splitter="best"))]) pipeline.fit(X_train, y_train) sklearn2pmml(pipeline, "data/pmml/DecisionTreeIris.pmml", with_repr=True)
# NOTE(review): this chunk starts mid-call — the opening
# `... = pd.read_csv("{}/{}".format(` is outside the visible region.
FILE_PREFIX, train_transaction))
identity_test_df = pd.read_csv("{}/{}".format(FILE_PREFIX, test_identity))
X_final = pd.read_csv("{}/{}".format(FILE_PREFIX, test_transaction))
print("===============finish file loading=================")

# Separate the fraud label from the transaction features.
Y_train = transaction_train_df_raw['isFraud']
X_train = transaction_train_df_raw.drop('isFraud', axis=1)  # 506691

# Feature engineering is delegated to a project-local PreProcessor that
# joins transaction and identity frames.
preprocessor = PreProcessor(transaction_train_df_raw, identity_train_df)
X_train_after_processing = preprocessor.preprocess()
model = RandomForestClassifier(n_estimators=100, random_state=0)
#my_pipeline = Pipeline(steps=[('preprocessor', preprocessor_pipeline), ('model', model)])
# Preprocessing of training data, fit model
# my_pipeline.fit(X_train, Y_train)
#my_pipeline.fit(X_train, Y_train)
from sklearn2pmml.pipeline import PMMLPipeline
# NOTE(review): only the classifier is wrapped — the PreProcessor step is
# NOT part of the exported pipeline, so the PMML expects pre-processed input.
pipeline = PMMLPipeline([
    #("preprocessing", dataFrameMapper),
    ("classifier", model)
])
pipeline.fit(X_train_after_processing, Y_train)
from sklearn2pmml import sklearn2pmml
sklearn2pmml(pipeline, "model.pmml", with_repr=True)
# Preprocessing of validation data, get predictions
#preds = my_pipeline.predict(X_test)
"""Fit a decision tree on Iris and export PMML; also defines a sample
DataFrameMapper for feature engineering."""
import pandas as pd
# Fix: np, MinMaxScaler, LabelBinarizer and FunctionTransformer are used
# by the mapper below but were never imported.
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, FunctionTransformer
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.datasets import load_iris
from sklearn_pandas import DataFrameMapper

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['Species'] = iris.target

pipeline = PMMLPipeline([("classifier", DecisionTreeClassifier())])
print(pipeline)
pipeline.fit(iris_df[iris_df.columns.difference(["Species"])], iris_df["Species"])
sklearn2pmml(pipeline, "DecisionTreeIris.pmml", with_repr=True)

# Define feature engineering with a Mapper
# (list selectors like ['sbp'] feed 2-D input to the transformer,
# bare string selectors like 'ldl' feed 1-D input).
mapper = DataFrameMapper([
    (['sbp'], MinMaxScaler()),
    (['tobacco'], MinMaxScaler()),
    ('ldl', None),
    ('adiposity', None),
    (['famhist'], LabelBinarizer()),
    ('typea', None),
    ('obesity', None),
    ('alcohol', None),
    (['age'], FunctionTransformer(np.log)),
])