def xgboost_to_pmml(data_X, data_y, par_file, save_model_as):
    """Save an XGBoost model to a PMML file.

    Parameters
    ----------
    data_X : pandas.DataFrame
        Variables of train data.
    data_y : pandas.DataFrame
        Labels of train data.
    par_file : str
        File path of the model's parameters (a JSON object of
        XGBClassifier keyword arguments).
    save_model_as : str
        File path of the PMML output.

    Returns
    -------
    None
        Generates a PMML file locally at `save_model_as`.

    Examples
    --------
    >>> xgboost_to_pmml(data_x, data_y, "par.json", "model.pmml")
    """
    # Build the XGBoost model from JSON-encoded hyper-parameters.
    with open(par_file, "r") as f:
        par = json.load(f)
    xgb_now = XGBClassifier(**par)
    # Wrap in a pipeline so sklearn2pmml can export it.
    pipeline = PMMLPipeline([("classifier", xgb_now)])
    # Fit the model.
    pipeline.fit(data_X, data_y)
    # Save the model.
    sklearn2pmml(pipeline, save_model_as, with_repr=True)
def build_audit(classifier, name, with_proba=True):
    """Fit an audit classifier behind a feature-engineering mapper, pickle
    the fitted pipeline, and dump its predictions to CSV."""
    employment_steps = [
        LabelBinarizer(),
        SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state=13)),
                        threshold="1.25 * mean")
    ]
    education_steps = [
        LabelBinarizer(),
        SelectorProxy(SelectFromModel(
            EstimatorProxy(RandomForestClassifier(random_state=13,
                                                  n_estimators=3)),
            threshold="median"))
    ]
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", employment_steps),
        ("Education", education_steps),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_X),
                          columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, proba), axis=1)
    store_csv(adjusted, name + ".csv")
def train_and_save_model(data, model_path):
    """Fit a linear regression on column x -> y and export it as PMML via
    sklearn2pmml."""
    pipeline = PMMLPipeline([("regressor", linear_model.LinearRegression())])
    features, target = data[["x"]], data["y"]
    pipeline.fit(features, target)
    sklearn2pmml(pipeline, model_path)
def getFirstContent(dataUrl, modelUrl, modelName):
    """Train a text classifier on the corpus under ``dataUrl`` and save it.

    Parameters
    ----------
    dataUrl : str
        Directory of training documents, loadable by ``load_files``.
    modelUrl : str
        Directory prefix for the persisted model.
    modelName : str
        File name of the persisted model.

    Returns
    -------
    str
        "success" if the model file exists afterwards, otherwise "fail".
    """
    training_data = load_files(dataUrl, encoding="utf-8")
    # Feature extraction: raw term counts.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_data.data)
    # Feature extraction: TF-IDF weights on top of the counts.
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    # Classify (LogisticRegression despite the "mnb"/naive-Bayes naming)
    # and fit the PMML-exportable pipeline.
    mnb_pipeline = PMMLPipeline([("classifier", LogisticRegression())])
    mnb_pipeline.fit(X_train_tfidf, training_data.target)
    # Save in pickle format.
    # (Bug fix: the original used C-style '//' comments here, which is a
    # Python syntax error.)
    joblib.dump(mnb_pipeline, modelUrl + modelName)
    # Save in PMML format.
    # NOTE(review): this writes to the same path as joblib.dump above and
    # overwrites the pickle — confirm whether distinct file names were
    # intended.
    sklearn2pmml(mnb_pipeline, modelUrl + modelName, with_repr=True)
    if os.path.exists(modelUrl + modelName):
        return "success"
    else:
        return "fail"
def build_sentiment(classifier, name, with_proba=True):
    """Fit a TF-IDF + chi2-percentile sentiment classifier, pickle it, and
    write its predictions (optionally with probabilities) to CSV."""
    # Random forests get float32 features to halve memory; others float64.
    dtype = (numpy.float32
             if isinstance(classifier, RandomForestClassifier)
             else numpy.float64)
    vectorizer = TfidfVectorizer(analyzer="word", preprocessor=None,
                                 strip_accents=None, lowercase=True,
                                 token_pattern=None, tokenizer=Splitter(),
                                 stop_words="english", ngram_range=(1, 2),
                                 norm=None, dtype=dtype)
    pipeline = PMMLPipeline([
        ("tf-idf", vectorizer),
        ("selector", SelectorProxy(SelectPercentile(chi2, percentile=10))),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(sentiment_X),
                          columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, proba), axis=1)
    store_csv(score, name + ".csv")
def build_iris(classifier, name, with_proba=True):
    """Fit an iris classifier on a union of raw and log10-scaled features
    (robust scaling + PCA), pickle the pipeline, and dump predictions."""
    raw_mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    log_mapper = DataFrameMapper([
        (iris_X.columns.values, FunctionTransformer(numpy.log10))
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([("normal_scale", raw_mapper),
                                ("log_scale", log_mapper)])),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba == True:
        proba_cols = ["probability(setosa)", "probability(versicolor)",
                      "probability(virginica)"]
        proba = DataFrame(pipeline.predict_proba(iris_X), columns=proba_cols)
        species = pandas.concat((species, proba), axis=1)
    store_csv(species, name + ".csv")
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit an audit classifier on a union of continuous features and
    polynomially expanded categorical features; persist the pipeline and its
    predictions. Extra kwargs are forwarded to ``customize`` post-fit."""
    continuous_mapper = DataFrameMapper([(col, ContinuousDomain())
                                         for col in ["Age", "Income", "Hours"]])
    categorical_mapper = DataFrameMapper([
        ("Employment", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectorProxy(SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13))))
        ]),
        ("Education", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectorProxy(SelectFromModel(
                EstimatorProxy(RandomForestClassifier(random_state=13,
                                                      n_estimators=3)),
                threshold="1.25 * mean"))
        ]),
        ("Marital", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        ("Occupation", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectorProxy(SelectKBest(k=3))
        ]),
        ("Gender", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-3, pos_label=3)
        ]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    # Apply post-fit customizations to the embedded classifier.
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_X),
                          columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_housing(regressor, name, with_kneighbors=False):
    """Fit a housing regressor on interaction-only polynomial, scaled,
    percentile-selected features; pickle the pipeline and dump MEDV
    predictions (optionally with nearest-neighbor indices) to CSV."""
    domain_mapper = DataFrameMapper([(housing_X.columns.values,
                                      ContinuousDomain())])
    transformer = Pipeline([
        ("polynomial", PolynomialFeatures(degree=2, interaction_only=True,
                                          include_bias=False)),
        ("scaler", StandardScaler()),
        ("selector", SelectorProxy(SelectPercentile(score_func=f_regression,
                                                    percentile=35))),
    ])
    pipeline = PMMLPipeline([
        ("mapper", domain_mapper),
        ("transformer-pipeline", transformer),
        ("regressor", regressor)
    ])
    pipeline.fit(housing_X, housing_y)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors == True:
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        neighbor_cols = ["neighbor(" + str(x + 1) + ")"
                         for x in range(regressor.n_neighbors)]
        # kneighbors()[1] holds 0-based row indices; report them 1-based.
        medv_ids = DataFrame(kneighbors[1] + 1, columns=neighbor_cols)
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
def save_as_PMML(data, modelPath):
    """Fit a linear regression on column x -> y and store it as PMML.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with columns "x" and "y".
    modelPath : str
        Destination path of the PMML file.
    """
    model = PMMLPipeline([("regressor", linear_model.LinearRegression())])
    model.fit(data[["x"]], data["y"])
    # Bug fix: honour the modelPath argument instead of the hard-coded
    # "linear.pmml" destination the original wrote to.
    sklearn2pmml(model, modelPath, with_repr=True)
def train(data_conf, model_conf, **kwargs):
    """Python train method called by the AOA framework.

    Parameters:
        data_conf (dict): The dataset metadata; expects a 'location' key
            pointing at a CSV file with a "species" column.
        model_conf (dict): The model configuration to use.

    Returns:
        None: No return.
    """
    # Load data & engineer.
    iris_df = pd.read_csv(data_conf['location'])
    # Renamed from `train` to avoid shadowing this function's own name.
    train_df, _ = train_test_split(iris_df, test_size=0.5, random_state=42)
    # Bug fix: the positional `axis` argument to DataFrame.drop was
    # deprecated and removed in pandas 2.0 — use the keyword form.
    X = train_df.drop("species", axis=1)
    y = train_df['species']

    print("Starting training...")

    # Fit model to training data.
    classifier = PMMLPipeline([('classifier', RandomForestClassifier())])
    classifier.fit(X, y.values.ravel())
    print("Finished training")

    # Export model artefacts to the models/ folder.
    # exist_ok avoids the check-then-create race of the original.
    os.makedirs('models', exist_ok=True)
    sklearn2pmml(classifier, "models/model.pmml")
    print("Saved trained model")
def pmml(x, Y):
    """Fit a logistic-regression pipeline on (x, Y) and export it to the
    file LogisticRegression.pmml."""
    from sklearn2pmml import PMMLPipeline, sklearn2pmml
    pipeline = PMMLPipeline([("classifier", LogisticRegression())])
    # Train the model.
    pipeline.fit(x, Y)
    sklearn2pmml(pipeline, "LogisticRegression.pmml")
def build_audit_dict(classifier, name, with_proba=True):
    """Fit a classifier on dict-encoded audit records via DictVectorizer,
    pickle the pipeline, and write its predictions to CSV."""
    pipeline = PMMLPipeline([
        ("dict-transformer", DictVectorizer()),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_dict_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_dict_X),
                          columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_iforest_housing_anomaly(iforest, name):
    """Fit an isolation-forest anomaly detector on the housing data; dump its
    decision function plus a "true"/"false" outlier flag to CSV."""
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    # IsolationForest labels anomalies as -1; serialize as "true"/"false".
    flags = pipeline.predict(housing_X) == -1
    outlier = DataFrame(flags, columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1),
              name + ".csv")
def build_auto_na(regressor, name):
    """Fit an mpg regressor on the auto dataset containing missing values,
    imputing continuous columns and imputing + binarizing categorical ones."""
    continuous_cols = ["acceleration", "displacement", "horsepower", "weight"]
    categorical_cols = ["cylinders", "model_year", "origin"]
    rows = [([col], [ContinuousDomain(missing_values=None), Imputer()])
            for col in continuous_cols]
    # Categorical columns encode missing values as -1.
    rows += [([col], [CategoricalDomain(missing_values=-1),
                      CategoricalImputer(missing_values=-1),
                      PMMLLabelBinarizer()])
             for col in categorical_cols]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(rows)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def test_fit(self):
    """fit() should (re)derive active/target field names from the column
    labels of the training frame on every call."""
    pipeline = PMMLPipeline([("estimator", DummyRegressor())])
    # Field metadata does not exist before the first fit.
    for attr in ("active_fields", "target_fields"):
        self.assertFalse(hasattr(pipeline, attr))
    y = Series([0.5, 1.0, 1.5], name="y")
    X = DataFrame([[1, 0], [2, 0], [3, 0]], columns=["X1", "X2"])
    # Refitting after a column rename must refresh the recorded names.
    for columns in (["X1", "X2"], ["x1", "x2"]):
        X.columns = columns
        pipeline.fit(X, y)
        self.assertEqual(columns, pipeline.active_fields.tolist())
        self.assertEqual("y", pipeline.target_fields.tolist())
class FirstStep(object):
    """End-to-end iris demo: split the data, build a mapper + decision-tree
    PMML pipeline, persist it with joblib, and reload it for prediction."""

    def __init__(self):
        self.__iris = load_iris()
        # Feature matrix / labels as DataFrames.
        self.__X = pd.DataFrame(self.__iris.data,
                                columns=self.__iris.feature_names)
        self.__y = pd.DataFrame(self.__iris.target, columns=["Species"])
        self.__train = None
        self.__train_label = None
        self.__test = None
        self.__test_one_sample = None
        self.__test_label = None
        self.__mapper = None
        self.__estimator = None
        self.__pipeline = None

    def train_test_split_step(self):
        """Split into 80/20 train/test and reset all row indices."""
        self.__train, self.__test, self.__train_label, self.__test_label = (
            train_test_split(self.__X, self.__y, test_size=0.2))
        self.__train = self.__train.reset_index(drop=True)
        self.__train_label = self.__train_label.reset_index(drop=True)
        self.__test = self.__test.reset_index(drop=True)
        # Bug fix: the original assigned self.__train (training FEATURES)
        # to self.__test_label here, silently replacing the test labels.
        self.__test_label = self.__test_label.reset_index(drop=True)

    def feature_engineering_step(self):
        """Standard-scale all four iris measurements."""
        self.__mapper = (DataFrameMapper([([
            "sepal length (cm)", "sepal width (cm)", "petal length (cm)",
            "petal width (cm)"
        ], [StandardScaler()])]))

    def model_train_step(self):
        """Choose the estimator (fitting happens in pipeline_step)."""
        self.__estimator = DecisionTreeClassifier()

    def pipeline_step(self):
        """Assemble mapper + estimator into a PMML pipeline and fit it."""
        self.__pipeline = PMMLPipeline([("mapper", self.__mapper),
                                        ("estimator", self.__estimator)])
        self.__pipeline.fit(self.__train, self.__train_label)

    def output_step(self):
        """Persist the fitted pipeline with joblib (compressed)."""
        joblib.dump(self.__pipeline,
                    "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z",
                    compress=3)

    def input_step(self):
        """Reload the pipeline and predict on the test set and on a
        single-row slice."""
        self.__pipeline = joblib.load(
            "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z")
        self.__test_one_sample = self.__test[0:1]
        print(self.__pipeline.predict(self.__test))
        # Predict on one record.
        print(self.__pipeline.predict(self.__test_one_sample))
def build_auto(regressor, name):
    """Fit an mpg regressor on the auto dataset and persist the pipeline
    together with its predictions."""
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight", "acceleration"],
         [ContinuousDomain(), Imputer(missing_values="NaN"),
          StandardScaler()]),
        # Pre/post 1973 oil crisis effects.
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)],
         {"alias": "bin(model_year, 77)"}),
        (["origin"], OneHotEncoder())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    store_pkl(pipeline, name + ".pkl")
    predictions = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(predictions, name + ".csv")
class XgbModel(object):
    """Train/evaluate a binary XGBoost classifier wrapped in a PMMLPipeline
    and persist it with joblib.

    Attributes are name-mangled privates: the train/test splits and labels
    given at construction, the fitted pipeline (__bst), and cached
    prediction/evaluation artefacts.
    """

    def __init__(self, train, train_label, test, test_label):
        # Raw train/test splits supplied by the caller.
        self.__train = train
        self.__train_label = train_label
        self.__test = test
        self.__test_label = test_label
        # __bst starts as the bare classifier, then is REPLACED by the
        # wrapping PMMLPipeline inside train().
        self.__bst = None
        self.__feat_imp = None
        self.__test_preds = None
        self.__test_predictions = None
        self.__output = None

    def train(self):
        """Build the classifier, wrap it in a PMMLPipeline, and fit.

        The eval metric is routed to the inner estimator via the
        "estimator__" fit-parameter prefix.
        """
        self.__bst = XGBClassifier(objective="binary:logistic")
        self.__bst = PMMLPipeline([("estimator", self.__bst)])
        self.__bst.fit(self.__train, self.__train_label,
                       estimator__eval_metric="auc")

    def predict(self):
        """Cache positive-class probabilities and hard labels for the test
        set. Must be called after train()."""
        self.__test_preds = self.__bst.predict_proba(self.__test)[:, 1]
        self.__test_predictions = self.__bst.predict(self.__test)

    def feature_importances(self):
        """Plot feature importances as a bar chart.

        NOTE(review): after train(), __bst is a PMMLPipeline, and pipelines
        do not normally expose feature_importances_ — confirm this attribute
        resolves, or read it from the inner estimator instead.
        NOTE(review): the hard-coded labels ["gbc", "rf", "ab", "lr"]
        presumably name four stacked input features — verify against the
        caller.
        """
        self.__feat_imp = (pd.Series(
            self.__bst.feature_importances_,
            ["gbc", "rf", "ab", "lr"]).sort_values(ascending=False))
        self.__feat_imp.plot(kind="bar", title="Feature Importances")
        plt.ylabel("Feature Importance Score")
        plt.show()

    def evaluate(self):
        """Print AUC (from probabilities) and accuracy (from hard labels)
        on the test set. Requires predict() to have run."""
        print("auc : %.4f" %
              roc_auc_score(self.__test_label, self.__test_preds))
        print("accuracy score : %.4f" %
              accuracy_score(self.__test_label, self.__test_predictions))

    def evaluate_output(self):
        """Write [test features | labels | probabilities] side by side to a
        CSV for offline inspection."""
        self.__output = np.hstack(
            (self.__test, self.__test_label.reshape(
                (-1, 1)), self.__test_preds.reshape((-1, 1))))
        pd.DataFrame(
            self.__output).to_csv("C:\\Users\\Dell\\Desktop\\output.csv")

    def xgbmodel_output(self):
        """Persist the fitted pipeline with joblib (compressed)."""
        joblib.dump(self.__bst,
                    "C:\\Users\\Dell\\Desktop\\bstML.pkl.z",
                    compress=True)
def build_versicolor(classifier, name, with_proba=True):
    """Fit a versicolor classifier on robust-scaled cubic polynomial features
    and persist the pipeline plus its predictions."""
    mapper = DataFrameMapper([
        (versicolor_columns[:-1], [ContinuousDomain(), RobustScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer", PolynomialFeatures(degree=3)),
        # k="all" keeps every feature; the selector exists for PMML shape.
        ("selector", SelectKBest(k="all")),
        ("classifier", classifier)
    ])
    pipeline.fit(versicolor_X, versicolor_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(versicolor_X),
                          columns=["probability_0", "probability_1"])
        species = pandas.concat((species, proba), axis=1)
    store_csv(species, name + ".csv")
def build_audit_na(classifier, name, with_proba=True):
    """Fit an audit classifier tolerant of missing values: impute continuous
    columns, impute + binarize categorical ones."""
    numeric_cols = ["Age", "Income", "Hours"]
    category_cols = ["Employment", "Education", "Marital", "Occupation",
                     "Gender"]
    rows = [([col], [ContinuousDomain(missing_values=None), Imputer()])
            for col in numeric_cols]
    rows += [([col], [CategoricalDomain(missing_values=None),
                      CategoricalImputer(), PMMLLabelBinarizer()])
             for col in category_cols]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(rows)),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_na_X),
                          columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_svm_housing_anomaly(svm, name):
    """Fit a one-class SVM anomaly detector on max-abs-scaled housing data;
    dump its decision function and "true"/"false" outlier flags to CSV."""
    mapper = DataFrameMapper([(housing_columns[:-1], ContinuousDomain())])
    scaled_svm = Pipeline([("first", MaxAbsScaler()), ("second", svm)])
    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", scaled_svm)])
    pipeline.fit(housing_X)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    # Non-positive predictions mark outliers.
    flags = pipeline.predict(housing_X) <= 0
    outlier = DataFrame(flags, columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1),
              name + ".csv")
def build_wheat(kmeans, name, with_affinity=True):
    """Fit a k-means clusterer on min-max-scaled wheat data; pickle it and
    dump cluster assignments (optionally with per-centroid affinities)."""
    mapper = DataFrameMapper([(wheat_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper), ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity == True:
        Xt = pipeline_transform(pipeline, wheat_X)
        # Distance of every sample to each of the three centroids.
        affinities = [kmeans_distance(kmeans, c, Xt) for c in range(3)]
        cluster_affinity = DataFrame(
            numpy.transpose(affinities),
            columns=["affinity(0)", "affinity(1)", "affinity(2)"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def model_wrapper_fit(self):
    """Wrap every configured estimator in a PMMLPipeline, fit each on the
    training data, collect the fitted pipelines, and print the fitted
    logistic-regression coefficients and intercept."""
    self.__model_list.extend([
        self.__gradient_boosting_classifier,
        self.__random_forest_classifier,
        self.__logistic_regression,
        self.__k_neighbors_classifier,
        self.__extra_tree_classifier,
        self.__xgb_classifier
    ])
    for estimator in self.__model_list:
        wrapped = PMMLPipeline([("estimator", estimator)])
        wrapped.fit(self.__train, self.__train_label)
        self.__pmml_model_list.append(wrapped)
    # Mirrors the original: the logistic regression is refit once per print.
    print(self.__logistic_regression.fit(self.__train,
                                         self.__train_label).coef_)
    print(self.__logistic_regression.fit(self.__train,
                                         self.__train_label).intercept_)
def build_iris(classifier, name, with_proba=True):
    """Fit an iris species classifier (robust scaling + incremental PCA) and
    persist the pipeline together with its predictions."""
    mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    steps = [("mapper", mapper),
             ("scaler", RobustScaler()),
             ("pca", IncrementalPCA(n_components=3, whiten=True)),
             ("classifier", classifier)]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(iris_X),
                          columns=["probability_setosa",
                                   "probability_versicolor",
                                   "probability_virginica"])
        species = pandas.concat((species, proba), axis=1)
    store_csv(species, name + ".csv")
def test_fit_verify(self):
    """fit() records field names from the training frame; verify() captures
    verification samples and rejects frames whose column order does not
    match the fitted active fields."""
    pipeline = PMMLPipeline([("estimator", DummyRegressor())])
    # Field metadata does not exist before the first fit.
    for attr in ("active_fields", "target_fields"):
        self.assertFalse(hasattr(pipeline, attr))
    y = Series([0.5, 1.0, 1.5], name="y")
    X = DataFrame([[1, 0], [2, 0], [3, 0]], columns=["X1", "X2"])
    # Refitting after a column rename must refresh the recorded names.
    for columns in (["X1", "X2"], ["x1", "x2"]):
        X.columns = columns
        pipeline.fit(X, y)
        self.assertEqual(columns, pipeline.active_fields.tolist())
        self.assertEqual("y", pipeline.target_fields.tolist())
    self.assertFalse(hasattr(pipeline, "verification"))
    pipeline.verify(X.sample(2))
    self.assertEqual(2, len(pipeline.verification.active_values))
    self.assertEqual(2, len(pipeline.verification.target_values))
    # Column order no longer matches the fitted active fields.
    X.columns = ["x2", "x1"]
    with self.assertRaises(ValueError):
        pipeline.verify(X.sample(2))
def build_wheat(kmeans, name, with_affinity=True):
    """Fit k-means on log10-transformed, min-max-scaled wheat measurements;
    dump cluster labels (optionally with per-centroid affinities) to CSV."""
    wheat_feature_names = [
        "Area", "Perimeter", "Compactness", "Kernel.Length", "Kernel.Width",
        "Asymmetry", "Groove.Length"
    ]
    mapper = DataFrameMapper([(wheat_feature_names, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("transformer",
                              FunctionTransformer(numpy.log10)),
                             ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity == True:
        Xt = pipeline_transform(pipeline, wheat_X)
        # Distance of every sample to each of the three centroids.
        affinities = [kmeans_distance(kmeans, c, Xt) for c in range(3)]
        cluster_affinity = DataFrame(
            numpy.transpose(affinities),
            columns=["affinity_0", "affinity_1", "affinity_2"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def build_audit_na(classifier, name, with_proba=True):
    """Fit an audit classifier on data with missing values, collapsing
    Employment into Private/Public/Other and mapping Gender to 0/1."""
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    rows = [([col], [ContinuousDomain(missing_values=None), Imputer()])
            for col in ["Age", "Income", "Hours"]]
    # Unknown employment categories fall through to "Other".
    rows.append(("Employment", [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        LookupTransformer(employment_mapping, "Other"),
        PMMLLabelBinarizer()
    ]))
    rows += [([col], [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        PMMLLabelBinarizer()
    ]) for col in ["Education", "Marital", "Occupation"]]
    rows.append(("Gender", [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        LookupTransformer(gender_mapping, None)
    ]))
    pipeline = PMMLPipeline([("mapper", DataFrameMapper(rows)),
                             ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_na_X),
                          columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, proba), axis=1)
    store_csv(adjusted, name + ".csv")
def save_pmml_model(model_path):
    """Train an XGBoost classifier on the (unsampled) training data and save
    it as a PMML file for parsing from Java.

    Parameters
    ----------
    model_path : str
        Destination path of the PMML file.
    """
    dataset = pd.read_csv(
        '/Users/looker/project/xmodel/same_product_judge/data/five_col_training_data.csv'
    )
    # Bug fix: DataFrame.ix was deprecated and removed from pandas — use
    # boolean-mask selection via .loc instead.
    X = dataset.loc[:, dataset.columns != 'label']
    Y = dataset.loc[:, dataset.columns == 'label'].values.ravel()
    # NOTE(review): the split is currently unused (the early-stopping fit on
    # x_train/x_test was disabled); kept to preserve behaviour and intent.
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=33)
    # Fit model for the full data.
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,    # number of boosting trees
        max_depth=6,          # tree depth
        min_child_weight=1,   # minimum leaf weight
        gamma=0.,             # penalty on the number of leaves
        subsample=1,          # row subsampling ratio per tree
        # Bug fix: was misspelled "colsample_btree", so the parameter was
        # never applied (1 is also the library default, so results match).
        colsample_bytree=1,
        scale_pos_weight=1,   # class-imbalance compensation
        random_state=27,
        objective='binary:logistic')
    # Save as a PMML-format model.
    mapper = DataFrameMapper([(['f0'], None), (['f1'], None), (['f2'], None),
                              (['f3'], None)])
    pipeline = PMMLPipeline([('mapper', mapper), ("classifier", model)])
    pipeline.fit(X, Y)
    sklearn2pmml(pipeline, model_path, with_repr=True)
import xgboost
from sklearn import datasets
from sklearn2pmml import sklearn2pmml
from sklearn2pmml import PMMLPipeline

# Boston housing data as plain numpy arrays (no column labels).
boston = datasets.load_boston()
X = boston.data
y = boston.target
feature_names = boston.feature_names

model = xgboost.XGBRegressor(learning_rate=0.1,
                             n_estimators=10,
                             max_depth=10,
                             silent=False)
boston_pipeline = PMMLPipeline([("regressor", model)])
boston_pipeline.fit(X, y)
# Bug fix: assign active_fields AFTER fitting — PMMLPipeline.fit derives
# field names from the training data, so a value set beforehand would be
# discarded; setting it here keeps the human-readable Boston names.
boston_pipeline.active_fields = feature_names
sklearn2pmml(boston_pipeline, "boston.pmml", with_repr=True, debug=True)
""" @author:duke.du @time:2018/11/29 19:19 @file:dust_pmml.py @contact: [email protected] @function:训练模型,生成PMML文件 """ import pandas as pd from sklearn.datasets import load_iris from sklearn.ensemble import RandomForestClassifier from sklearn2pmml import PMMLPipeline, sklearn2pmml # No problem iris = load_iris() # 创建带有特征名称的 DataFrame iris_df = pd.DataFrame(iris.data, columns=iris.feature_names) print("iris' size :", iris_df.shape) # 创建模型管道 iris_pipeline = PMMLPipeline([ ("classifier", RandomForestClassifier()) ]) # 训练模型 iris_pipeline.fit(iris_df, iris.target) # 导出模型到 RandomForestClassifier_Iris.pmml 文件 sklearn2pmml(iris_pipeline, "pmml/RandomForestClassifier_Iris.pmml")