'''
@Author: Runsen
@WeChat Official Account: 润森笔记
@Blog: https://blog.csdn.net/weixin_44510615
@Date: 2020/5/24
'''
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
pipeline = PMMLPipeline([("pca", PCA(n_components=3)), ("classifier", SVC())])
iris = load_iris()
pipeline.fit(iris.data, iris.target)
sklearn2pmml(pipeline, "iris_SVC.pmml", with_repr=True)
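# Hedged follow-up sketch: Example #5 below uses PMMLPipeline.verify to embed
# a small verification sample before export, so PMML engines can cross-check
# their predictions against scikit-learn's. The 10-row sample and the
# re-export here are illustrative assumptions.
pipeline.verify(iris.data[:10])
sklearn2pmml(pipeline, "iris_SVC.pmml", with_repr=True)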
Example #2
def build_audit_na(classifier,
                   name,
                   with_proba=True,
                   predict_transformer=None,
                   predict_proba_transformer=None,
                   apply_transformer=None,
                   **pmml_options):
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0.0, "MALE": 1.0, "MISSING_VALUE": 0.5}
    mapper = DataFrameMapper([(["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"),
              name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Age"], MissingIndicator())] + [(["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"),
              name="flag_missing(Hours, -999)"),
        SimpleImputer(missing_values=-999, add_indicator=True)
    ])] + [(["Income"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        SimpleImputer(strategy="median", add_indicator=True)
    ])] + [(["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(missing_values=None),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])] + [([column], [
        CategoricalDomain(missing_values=None,
                          missing_value_replacement="N/A",
                          with_data=False),
        SimpleImputer(missing_values="N/A", strategy="most_frequent"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        SimpleImputer(strategy="constant"),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])])
    pipeline = PMMLPipeline(
        [("mapper", mapper), ("classifier", classifier)],
        predict_transformer=predict_transformer,
        predict_proba_transformer=predict_proba_transformer,
        apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name)
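# Hedged usage sketch: the classifier, its parameters, and the artifact name
# are illustrative assumptions; audit_na_X/audit_na_y and the store_* helpers
# come from the surrounding test harness.
from sklearn.tree import DecisionTreeClassifier

build_audit_na(DecisionTreeClassifier(min_samples_leaf=5, random_state=13),
               "DecisionTreeAuditNA",
               compact=False)  # assumed pmml_options entry for tree models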
Example #3
# Fragment: assumes training data X, Y and the relevant imports (pandas, SVC,
# DataFrameMapper, TfidfVectorizer, Splitter, accuracy_score) upstream.
pipeline = PMMLPipeline([
    ('mapper',
     DataFrameMapper([('name',
                       TfidfVectorizer(norm=None,
                                       analyzer="word",
                                       max_features=500,
                                       tokenizer=Splitter())),
                      ('description',
                       TfidfVectorizer(norm=None,
                                       analyzer="word",
                                       max_features=1000,
                                       tokenizer=Splitter()))])),
    ('model',
     SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM (note: SVC defaults to an RBF kernel, not a linear one)
])
print("model set done!")
pipeline.fit(X, Y)
print("model fit done!")
c = pd.read_csv(r"C:\Users\钟顺民\Desktop\handbags.csv",
                sep=',',
                encoding='ISO-8859-1').dropna().sample(n=200)

prediction = pipeline.predict(c.drop(['id'], axis=1))
t = c['id']
print("Accuracy Score ->", accuracy_score(prediction, t) * 100)
"""
Accuracy Score -> 98.0
"""
# print(accuracy_score(prediction, Test_Y) * 100)
# Test_X.to_csv(r"C:\Users\钟顺民\Desktop\test.csv")
# df.to_csv(r"C:\Users\钟顺民\Desktop\3.csv")
Example #4
class AutoBuilder:
    """'
    E2E classifier builder
    
    Builds binary classifier, including:
        - dataset EDA (optional)
        - hyperparameter tuning (optional)
        - model performance assessment
        - SHAP-based feature analysis
        - feature selection
        - creating deployment package (pmml & pkl)

    Attributes:
        auto_build (method): automatically builds the classifier and populates the output_dir path with model artifacts and evaluation charts

    """
    def __init__(
        self,
        output_dir_path,
        csv_path,
        target_col="target",
        ignore_cols=[],
        eda_flag=True,
        tune_flag=True,
        cardinality_threshold=100,
        shap_plot_num=10,
        shap_frac=0.05,
        importance_cutoff=0.00,
        corr_cutoff=0.9,
        search_space=LGB_SEARCH_SPACE,
        tuning_iters=25,
        lgb_params={},
        random_state=1234,
    ):
        """
        Args:
            output_dir_path (string): filepath where the outputs package is created and saved
            csv_path (string): filepath to input csv; NOTE columns must be preprocessed to numeric or string type
            target_col (string, optional): target column, default 'target'
            ignore_cols (iterable, optional): columns to be dropped, default []
            eda_flag (boolean, optional): whether EDA plots are generated, default True
            tune_flag (boolean, optional): whether LightGBM hyperparameters are tuned, default True
            shap_plot_num (numeric, optional): generate SHAP dependency plots for the N most important features, default 10
            shap_frac (numeric, optional): proportion of data sampled for SHAP analysis, default 5%
            importance_cutoff (numeric, optional): abs. avg. SHAP value below which dropping a feature is suggested, default 0.00
            corr_cutoff (numeric, optional): abs. correlation above which dropping a feature is suggested, default 0.9
            search_space (iterable, optional): tuning space for Bayesian optimisation, default LGB_SEARCH_SPACE
            tuning_iters (numeric, optional): number of tuning iterations for Bayesian optimisation, default 25
            lgb_params (dict, optional): hyperparameters to use when tune_flag=False, default {}
            random_state (numeric, optional): random seed for train/test split and model training, default 1234
        """
        self.output_dir_path = output_dir_path
        self.csv_path = csv_path
        self.target_col = target_col
        self.ignore_cols = ignore_cols
        self.eda_flag = eda_flag
        self.tune_flag = tune_flag
        self.cardinality_threshold = cardinality_threshold
        self.shap_plot_num = shap_plot_num
        self.shap_frac = shap_frac
        self.importance_cutoff = importance_cutoff
        self.corr_cutoff = corr_cutoff
        self.search_space = search_space
        self.tuning_iters = tuning_iters
        self.lgb_params = lgb_params
        self.random_state = random_state

    def _gen_model_dir(self):
        """
        Creates output directory according to self.output_dir_path, removing previous output if there.

        Also makes subdirectories
            /bin
            /plots
        """
        logger.info(f"building directory {self.csv_path}")
        if os.path.exists(self.output_dir_path) and os.path.isdir(
                self.output_dir_path):
            shutil.rmtree(self.output_dir_path)
        os.mkdir(self.output_dir_path)
        os.mkdir(self.output_dir_path + "/bin")
        os.mkdir(self.output_dir_path + "/plots")

    def _process_csv(self):
        """
        Parses csv specified in self.csv_path, saving to self.raw

        Also
            - Drops ignore columns
            - Validates target and feature columns
                Target = binary, 0-1
                Features = numeric or string
        """
        logger.info(f"loading file {self.csv_path}")
        raw = pd.read_csv(self.csv_path).drop(columns=self.ignore_cols)

        logger.info("checking valid input data")
        assert raw[self.target_col].isna().sum() == 0

        assert list(sorted(raw[self.target_col].unique())) == [0, 1]

        valid_shape = raw.select_dtypes(
            include=["int64", "float64", "object"]).shape
        assert valid_shape == raw.shape
        self.raw = raw
        raw.to_csv(f"{self.output_dir_path}/bin/raw.csv")

    def _prepare_X_y(self):
        """
        Splits self.raw into X_train, X_test, y_train, y_test.

        Also records categorical and numerical columns, and saves csv of training set
        """

        y = self.raw[self.target_col]
        X = self.raw.drop(columns=self.target_col)

        logger.info("train test split")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, random_state=self.random_state)
        data_train = self.X_train.copy()
        data_train["target"] = self.y_train

        training_data_path = f"{self.output_dir_path}/bin/train.csv"
        data_train.to_csv(training_data_path, index=False)

        del X, y

    def _create_categorical_transformer(self):
        self.categorical_cols = self.X_train.select_dtypes(
            include=["object"]).columns
        self.numeric_cols = self.X_train.select_dtypes(
            include=["int64", "float64"]).columns

        self.mapper = DataFrameMapper(
            [([cat_column],
              [CategoricalDomain(), LabelEncoder()])
             for cat_column in self.categorical_cols] +
            [(self.numeric_cols, ContinuousDomain())])

        # hacky: also storing a separate X_train_encoded and classifier, because SHAP and skopt couldn't be made to work on the end-to-end pipeline
        self.X_train_encoded = self.mapper.fit_transform(self.X_train)
        # use the mapper's output names so SHAP features line up with the encoded matrix
        self.var_names = self.mapper.transformed_names_

    def _tune(self):
        """
        Explores tuning space, updating self.lgb_params with values that minimize cross-validated brier score
        """
        # TODO: could memory/code be saved (and binning strategies tuned) by passing the unencoded X_train into the pipeline?
        logger.info(f"tuning hyperparameters ({self.tuning_iters} iterations)")
        results = utils.bayes_hyperparam_tune(
            model=lgb.LGBMClassifier(objective="binary"),
            X=self.X_train_encoded,
            y=self.y_train,
            search_space=self.search_space,
            n_iters=self.tuning_iters,
        )
        self.lgb_params = results.best_params_
        logger.info(f"best params {self.lgb_params}")

    def _save_model(self):
        """
        Saves the fitted sklearn pipeline to self.output_dir_path as a pkl
        file (the pmml export is currently commented out).
        """
        pmml_path = f"{self.output_dir_path}/model-pmml.pmml"
        pkl_path = f"{self.output_dir_path}/model-bin.pkl"
        with open(pkl_path, "wb") as pkl_file:
            pickle.dump(self.pipeline, pkl_file)
        # sklearn2pmml(self.pipeline, pmml_path)

    def _generate_shap_plots(self):
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        classifier.fit(self.X_train_encoded, self.y_train)
        X_shap = pd.DataFrame(data=self.X_train_encoded,
                              columns=self.var_names)
        self.feature_importance = utils.create_shap_plots(
            classifier,
            X_shap,
            output_dir=self.output_dir_path,
            N=self.shap_plot_num,
            frac=self.shap_frac,
        )

    def auto_build(self):
        """
        Populates the output_dir path with model artifacts and evaluation charts
        """
        self._gen_model_dir()

        self._process_csv()

        self._prepare_X_y()

        if self.eda_flag:
            logger.info("EDA")
            utils.dataset_eda(data=self.X_train,
                              output_dir=self.output_dir_path)

        self._create_categorical_transformer()

        if self.tune_flag:
            self._tune()

        self._generate_shap_plots()

        logger.info("creating pipeline")
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        self.pipeline = PMMLPipeline([("mapper", self.mapper),
                                      ("classifier", classifier)])

        self.pipeline.fit(self.X_train, self.y_train)

        logger.info("Assessing model")

        y_pred = self.pipeline.predict_proba(self.X_test)[:, 1]
        # naive benchmark: predict the training base rate for every test case
        y_bm = np.repeat(self.y_train.mean(), self.y_test.shape[0])
        utils.evaluate_model(self.y_test, y_pred, y_bm, self.output_dir_path,
                             "Model")

        logger.info("suggeting features to remove")
        self.cols_to_remove = utils.find_features_to_remove(
            importance=self.feature_importance,
            X=self.X_train,
            importance_cutoff=self.importance_cutoff,
            corr_threshold=self.corr_cutoff,
        )
        logger.info(f"candidates to remove - {self.cols_to_remove}")

        logger.info(f"saving model \n{self.output_dir_path}")

        self._save_model()
        test_input = dict(self.X_test.iloc[0])
        test_score = self.pipeline.predict_proba(self.X_test.head(1))
        logger.info(
            f"test-case model inputs \n{ test_input } \n model score \n {test_score}"
        )

        logger.info("done!")
Example #5
    # Fragment: opening reconstructed; assumes the build_iris helper and a
    # VotingClassifier import upstream.
    build_iris(VotingClassifier([("dt",
                                  DecisionTreeClassifier(random_state=13)),
                                 ("nb", GaussianNB()),
                                 ("lr", LogisticRegression())]),
               "VotingEnsembleIris",
               with_proba=False)
    build_iris(OptimalXGBClassifier(objective="multi:softprob", ntree_limit=7),
               "XGBIris",
               ntree_limit=7)

if "Iris" in datasets:
    classifier = RuleSetClassifier(
        [("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75",
          "versicolor"), ("X['Petal.Length'] >= 2.45", "virginica")],
        default_score="setosa")
    pipeline = PMMLPipeline([("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    pipeline.verify(iris_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, "RuleSetIris")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    store_csv(species, "RuleSetIris")

#
# Text classification
#

sentiment_X, sentiment_y = load_sentiment("Sentiment")


def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    pipeline = PMMLPipeline([
        ("tf-idf",
Example #6
class url(object):
    def __init__(self):
        # Read the data
        good_query_list = self.get_query_list('goodqueries.txt')
        bad_query_list = self.get_query_list('badqueries.txt')

        # Label the benign (0) and malicious (1) data
        good_y = [0] * len(good_query_list)
        bad_y = [1] * len(bad_query_list)

        queries = good_query_list + bad_query_list
        y = good_y + bad_y

        # Convert the raw text data into vectors
        self.vectorizer = TfidfVectorizer(tokenizer=self.get_ngrams)

        # Turn the text strings into a ([i, j], tf-idf value) matrix X
        X = self.vectorizer.fit_transform(queries)

        # Split into training data (to build the model) and test data (to
        # measure model accuracy); hold out 20% for testing
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

        # Define the model (logistic regression)
        self.lgs = PMMLPipeline([('LogisticModel',
                                  LogisticRegression(solver='liblinear'))])

        # Train the model
        self.lgs.fit(X_train, y_train)

        # Measure model accuracy on the held-out data
        print('Model accuracy: {}'.format(self.lgs.score(X_test, y_test)))

        sklearn2pmml(self.lgs, 'lgs.pmml', with_repr=True)

    # Read the list of requests from a text file
    def get_query_list(self, filename):
        filepath = os.path.join(os.getcwd(), 'data', 'train', filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            data = f.readlines()
        query_list = []
        for d in data:
            d = str(urllib.parse.unquote(d))
            query_list.append(d)
        return list(set(query_list))

    def test_query_list(self, filename):
        filepath = os.path.join(os.getcwd(), 'data', 'test', filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            data = f.readlines()
        query_list = []
        for d in data:
            d = str(urllib.parse.unquote(d))
            query_list.append(d)
        return list(set(query_list))

    # Split the string into sliding 3-character n-grams
    def get_ngrams(self, query):
        tempQuery = str(query)
        ngrams = []
        # len - 2 so the final trigram is included (len - 3 would drop it)
        for i in range(0, len(tempQuery) - 2):
            ngrams.append(tempQuery[i:i + 3])
        return ngrams

    # Predict new URLs
    def predict(self, newQueries):
        newQueries = [urllib.parse.unquote(q) for q in newQueries]
        X_predict = self.vectorizer.transform(newQueries)
        res = self.lgs.predict(X_predict)
        res_list = []
        for q, r in zip(newQueries, res):
            tmp = 'normal request' if r == 0 else 'malicious request'
            q_entity = html.escape(q)
            res_list.append({'url': q_entity, 'res': tmp})
        print("Prediction results: {}".format(str(res_list)))
        return res_list
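# Usage sketch: assumes data/train/goodqueries.txt and data/train/badqueries.txt
# exist; the sample URLs are illustrative.
if __name__ == '__main__':
    detector = url()
    detector.predict([
        'www.example.com/index.php?id=1',
        "www.example.com/index.php?id=1' or '1'='1",  # hypothetical injection probe
    ])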
Example #7
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), OneHotEncoder()])
     for f in categorical_features] +
    [(f, [CategoricalDomain(),
          CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper), ("model", GaussianNB())])
pipeline.fit(data, data["drv"])

sklearn2pmml(pipeline, "test/support/python/naive_bayes.pmml")

print(pipeline.predict(data[:10]))
Example #8
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), OneHotEncoder()])
     for f in categorical_features] + [(f, [
         CategoricalDomain(),
         CountVectorizer(tokenizer=Splitter(), max_features=5)
     ]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper), ("model", LinearRegression())])
pipeline.fit(data, data["hwy"])

sklearn2pmml(pipeline, "test/support/python/linear_regression_text.pmml")

print(list(pipeline.predict(data[:10])))
Example #9
def build_audit_na(classifier, name, with_proba=True, **kwargs):
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper([(["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
              name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
              name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Income"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        Imputer()
    ])] + [(["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])] + [([column], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(missing_values=None),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
Example #10
try:
    data = pd.read_csv(csv_url, sep=";")
except Exception as e:
    logger.exception("Unable to download training & test CSV, "
                     f"check your internet connection. Error: {e}")
    exit(1)

train, test = train_test_split(data)

train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
train_y = train[["quality"]]
test_y = test[["quality"]]

lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
lr.pmml_name_ = f"PMML-ElasticnetWineModel-{model_id}"
pipeline = PMMLPipeline(steps=[("elastic_net", lr)])

pipeline.fit(train_x, train_y)
predicted_qualities = pipeline.predict(test_x)

(rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

print("Elasticnet model (alpha={}, l1_ratio={}):".format(alpha, l1_ratio))
print("  RMSE: {}".format(rmse))
print("  MAE: {}".format(mae))
print("  R2: {}".format(r2))

sklearn2pmml(pipeline, output_path, with_repr=True)
print(f"Elasticnet model (alpha={alpha}, l1_ratio={l1_ratio}) exported")
Example #11
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.datasets import load_iris
import os

iris = load_iris()
data = iris.data
target = iris.target

# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'
# X=[[1,2,3,1],[2,4,1,5],[7,8,3,6],[4,8,4,7],[2,5,6,9]]
# y=[0,1,0,2,1]

pipeline = PMMLPipeline([("classifier",
                          tree.DecisionTreeClassifier(random_state=9))])
pipeline.fit(data, target)
sklearn2pmml(pipeline, "tree_result.pmml")
Example #12
# Fragment: assumes `features` and `labels` were produced by earlier
# preprocessing of a labelled dataset (omitted here).
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.model_selection import train_test_split
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

labels = labels.values.ravel()

# Split the data into training and test sets and print dimensions
df = pd.DataFrame(features)
X_train, X_test, y_train, y_test = train_test_split(df, labels, train_size=0.8, test_size=0.2)


# n_estimators: number of trees in the forest; n_jobs: number of parallel jobs (-1 = number of processor cores); random_state: random seed
print("Training model...")
clf = RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=100)
trained_model = clf.fit(X_train, y_train)
print("Score:", trained_model.score(X_train, y_train))


model = PMMLPipeline([('RandomForest', RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=100))])
model.fit(X_train,y_train)
sklearn2pmml(model, './save_model/RandomForest.pmml', with_repr=True)

# predicting
print("Predicting...")
y_pred = clf.predict(X_test)

print("Computing performance metrics...")
results = confusion_matrix(y_test, y_pred)
error = zero_one_loss(y_test, y_pred)

# Compute prediction accuracy from the confusion matrix
list_diag = np.diag(results)
list_raw_sum = np.sum(results, axis=1)
print("Predict accuracy: ", np.mean(list_diag) / np.mean(list_raw_sum))
Example #13
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

import os
# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'

X = [[1, 2, 3, 1], [2, 4, 1, 5], [7, 8, 3, 6], [4, 8, 4, 7], [2, 5, 6, 9]]
y = [0, 1, 0, 2, 1]
pipeline = PMMLPipeline([("classifier",
                          tree.DecisionTreeClassifier(random_state=9))])
pipeline.fit(X, y)

# pip install --user --upgrade git+https://github.com/jpmml/sklearn2pmml.git
sklearn2pmml(pipeline, "demo.pmml", with_repr=True)

import joblib  # sklearn.externals.joblib was removed in modern scikit-learn
joblib.dump(pipeline, "pipeline.pkl.z", compress=9)

# java -jar target/jpmml-sklearn-executable-1.5-SNAPSHOT.jar --pkl-input pipeline.pkl.z --pmml-output pipeline.pmml
Example #14
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

mapper = DataFrameMapper(
  [(numeric_features, [ContinuousDomain()])] +
  [([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
  [(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter(), max_features=5)]) for f in text_features]
)

pipeline = PMMLPipeline([
  ("mapper", mapper),
  ("model", LGBMRegressor(n_estimators=1000))
])
# fit params are routed to the model step via the "model__" prefix
# (e.g. model__sample_weight for sample weights); the categorical_feature
# indices refer to the mapped matrix, where the 3 numeric columns come
# first, so "drv" and "class" land at positions 3 and 4
pipeline.fit(data, data["hwy"], model__categorical_feature=[3, 4])

sklearn2pmml(pipeline, "test/support/python/lightgbm_regression.pmml")

print(pipeline.predict(data[:10]))
Example #15
import pandas

iris_df = pandas.read_csv("Iris.csv")

from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer  # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
from sklearn.linear_model import LogisticRegression
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

pipeline = PMMLPipeline([
    ("mapper",
     DataFrameMapper([
         (["Sepal.Length", "Sepal.Width", "Petal.Length",
           "Petal.Width"], [ContinuousDomain(), Imputer()])
     ])), ("pca", PCA(n_components=3)), ("selector", SelectKBest(k=2)),
    ("classifier", LogisticRegression())
])
pipeline.fit(iris_df, iris_df["Species"])

from sklearn2pmml import sklearn2pmml

sklearn2pmml(pipeline, "LogisticRegressionIris.pmml", with_repr=True)
Example #16
# Fragment: assumes raw_data (a pandas DataFrame) and last_column_index were
# defined by earlier loading code (omitted here).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

# raw_data=raw_data.sample(frac=0.03)

# Convert non-numeric data to numeric
# print("Transforming data...")
raw_data[last_column_index], attacks = pd.factorize(
    raw_data[last_column_index], sort=True)
# Slice the raw data into features and labels: columns 1-41 are features, column 42 is the label
features = raw_data.iloc[:, :raw_data.shape[1] - 1]  # pandas iloc slicing is purely position-based
labels = raw_data.iloc[:, raw_data.shape[1] - 1:]
# Data standardization
# features = preprocessing.scale(features)
# features = pd.DataFrame(features)
# Flatten the multi-dimensional labels into a 1-D array
labels = labels.values.ravel()

# Split the data into training and test sets and print dimensions
df = pd.DataFrame(features)
X_train, X_test, y_train, y_test = train_test_split(df,
                                                    labels,
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    stratify=labels)

pipeline = PMMLPipeline([("classifier",
                          DecisionTreeClassifier(criterion='entropy',
                                                 max_depth=12,
                                                 min_samples_leaf=1,
                                                 splitter="best"))])
pipeline.fit(X_train, y_train)
sklearn2pmml(pipeline, "data/pmml/DecisionTreeIris.pmml", with_repr=True)
Example #17
    # Fragment: reconstructed training-data reads; FILE_PREFIX and the
    # train_identity / train_transaction filenames are assumed upstream.
    identity_train_df = pd.read_csv("{}/{}".format(FILE_PREFIX, train_identity))
    transaction_train_df_raw = pd.read_csv("{}/{}".format(
        FILE_PREFIX, train_transaction))

    identity_test_df = pd.read_csv("{}/{}".format(FILE_PREFIX, test_identity))
    X_final = pd.read_csv("{}/{}".format(FILE_PREFIX, test_transaction))
    print("===============finish file loading=================")
    Y_train = transaction_train_df_raw['isFraud']
    X_train = transaction_train_df_raw.drop('isFraud', axis=1)  # 506691

    preprocessor = PreProcessor(transaction_train_df_raw, identity_train_df)
    X_train_after_processing = preprocessor.preprocess()
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    #my_pipeline = Pipeline(steps=[('preprocessor', preprocessor_pipeline), ('model', model)])

    # Preprocessing of training data, fit model
    #my_pipeline.fit(X_train, Y_train)

    from sklearn2pmml.pipeline import PMMLPipeline

    pipeline = PMMLPipeline([
        #("preprocessing", dataFrameMapper),
        ("classifier", model)
    ])
    pipeline.fit(X_train_after_processing, Y_train)

    from sklearn2pmml import sklearn2pmml

    sklearn2pmml(pipeline, "model.pmml", with_repr=True)

    # Preprocessing of validation data, get predictions
    #preds = my_pipeline.predict(X_test)
Example #18
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.datasets import load_iris
from sklearn_pandas import DataFrameMapper

iris = load_iris()

iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['Species'] = iris.target

pipeline = PMMLPipeline([("classifier", DecisionTreeClassifier())])
print(pipeline)

pipeline.fit(iris_df[iris_df.columns.difference(["Species"])],
             iris_df["Species"])
sklearn2pmml(pipeline, "DecisionTreeIris.pmml", with_repr=True)

# Separate fragment: define feature engineering with a DataFrameMapper
# (assumes a dataframe with the columns below; missing imports added)
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, FunctionTransformer

mapper = DataFrameMapper([
    (['sbp'], MinMaxScaler()),
    (['tobacco'], MinMaxScaler()),
    ('ldl', None),
    ('adiposity', None),
    (['famhist'], LabelBinarizer()),
    ('typea', None),
    ('obesity', None),
    ('alcohol', None),
    (['age'], FunctionTransformer(np.log)),
])