Code Example #1
def test_transform_float(self):
    y = [1.0, float("NaN"), 2.0, 3.0]
    binarizer = PMMLLabelBinarizer()
    binarizer.fit(y)
    self.assertEqual([[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]],
                     binarizer.transform([1.0, 3.0, float("NaN"), 2.0]).tolist())
Code Example #2
File: __init__.py Project: xuming1986/sklearn2pmml
	def test_transform_float(self):
		X = [1.0, float("NaN"), 2.0, 3.0]
		dense_binarizer = PMMLLabelBinarizer()
		dense_binarizer.fit(X)
		Xt_dense = dense_binarizer.transform([1.0, 3.0, float("NaN"), 2.0])
		self.assertIsInstance(Xt_dense, numpy.ndarray)
		self.assertEqual([[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]], Xt_dense.tolist())
		sparse_binarizer = PMMLLabelBinarizer(sparse_output = True)
		sparse_binarizer.fit(X)
		Xt_sparse = sparse_binarizer.transform([1.0, 3.0, float("NaN"), 2.0])
		self.assertIsInstance(Xt_sparse, scipy.sparse.csr_matrix)
		self.assertEqual(Xt_dense.tolist(), Xt_sparse.toarray().tolist())
Code Example #3
File: xgboost.py Project: wdtzliu/sklearn2pmml
def make_xgboost_column_transformer(dtypes, missing_value_aware=True):
    """Construct a ColumnTransformer for feeding complex data into an XGBModel.

    Parameters
    ----------
    dtypes: iterable of tuples (column, dtype)

    missing_value_aware: boolean
        If true, use missing value aware transformers.

    Returns
    -------
    ColumnTransformer

    """
    transformers = list()
    for column, dtype in dtypes.items():
        if _is_categorical(dtype):
            if missing_value_aware:
                transformer = PMMLLabelBinarizer(sparse_output=True)
            else:
                transformer = Pipeline([("ordinal_encoder", OrdinalEncoder()),
                                        ("one_hot_encoder", OneHotEncoder())])
            transformers.append((column, transformer, [column]))
        else:
            transformers.append((column, "passthrough", [column]))
    return ColumnTransformer(transformers, remainder="drop")
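A minimal usage sketch of the function above (the DataFrame df and its columns are made up here, and the module-level helper _is_categorical is assumed to recognize pandas "category" dtypes):

import pandas

# Toy input; df.dtypes is a Series, so dtypes.items() yields (column, dtype) pairs.
df = pandas.DataFrame({
    "color": pandas.Series(["red", "green", None, "red"], dtype="category"),
    "weight": [1.0, 2.5, 3.0, 4.5]
})
mapper = make_xgboost_column_transformer(df.dtypes, missing_value_aware=True)
Xt = mapper.fit_transform(df)  # binarized "color" levels plus the passthrough "weight" column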
Code Example #4
File: xgboost.py Project: wdtzliu/sklearn2pmml
def make_xgboost_dataframe_mapper(dtypes, missing_value_aware=True):
    """Construct a DataFrameMapper for feeding complex data into an XGBModel.

    Parameters
    ----------
    dtypes: iterable of tuples (column, dtype)

    missing_value_aware: boolean
        If true, use missing value aware transformers.

    Returns
    -------
    DataFrameMapper

    """
    features = list()
    for column, dtype in dtypes.items():
        if _is_categorical(dtype):
            if missing_value_aware:
                binarizer = PMMLLabelBinarizer(sparse_output=True)
            else:
                binarizer = LabelBinarizer(sparse_output=True)
            features.append(([column], binarizer))
        else:
            features.append(([column], None))
    return DataFrameMapper(features)
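The DataFrameMapper variant accepts the same dtypes; categorical columns get a binarizer, while numeric columns are paired with None and passed through unchanged. Reusing the toy DataFrame df from the previous sketch:

mapper = make_xgboost_dataframe_mapper(df.dtypes, missing_value_aware=True)
# features built as [(["color"], PMMLLabelBinarizer(sparse_output=True)), (["weight"], None)]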
Code Example #5
def build_auto_na(regressor,
                  name,
                  predict_transformer=None,
                  apply_transformer=None,
                  **pmml_options):
    mapper = DataFrameMapper(
        [([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer()
        ]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalImputer(missing_values=-1),
                       OneHotEncoder()])] +
        [(["acceleration"], [
            ContinuousDomain(missing_values=None),
            CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25],
                           labels=False),
            CategoricalImputer(),
            LabelBinarizer()
        ])] + [(["displacement"], [
            ContinuousDomain(missing_values=None),
            Imputer(),
            CutTransformer(bins=[0, 100, 200, 300, 400, 500],
                           labels=["XS", "S", "M", "L", "XL"]),
            LabelBinarizer()
        ])] + [(["horsepower"], [
            ContinuousDomain(missing_values=None,
                             outlier_treatment="as_extreme_values",
                             low_value=50,
                             high_value=225),
            Imputer()
        ])] + [(["weight"], [
            ContinuousDomain(missing_values=None,
                             outlier_treatment="as_extreme_values",
                             low_value=2000,
                             high_value=5000),
            Imputer()
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)],
                            predict_transformer=predict_transformer,
                            apply_transformer=apply_transformer)
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        tree = regressor.tree_
        node_impurity = {
            node_idx: tree.impurity[node_idx]
            for node_idx in range(0, tree.node_count)
            if tree.impurity[node_idx] != 0.0
        }
        pmml_options["node_extensions"] = {regressor.criterion: node_impurity}
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    if isinstance(regressor, DecisionTreeRegressor):
        Xt = pipeline_transform(pipeline, auto_na_X)
        mpg_apply = DataFrame(regressor.apply(Xt), columns=["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis=1)
    store_csv(mpg, name + ".csv")
Code Example #6
	def test_transform_string(self):
		y = ["A", None, "B", "C"]
		binarizer = PMMLLabelBinarizer()
		binarizer.fit(y)
		self.assertEqual([[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]], binarizer.transform(["A", "C", None, "B"]).tolist())
		self.assertEqual([[0, 0, 0]], binarizer.transform([None]).tolist())
		self.assertEqual([[1, 0, 0], [0, 1, 0], [0, 0, 1]], binarizer.transform(["A", "B", "C"]).tolist())
Code Example #7
def test_fit_string(self):
    y = ["A", None, "A", "B", None, "C", "C", "B"]
    labels = ["A", "B", "C"]
    binarizer = PMMLLabelBinarizer()
    self.assertFalse(hasattr(binarizer, "classes_"))
    binarizer.fit(y)
    self.assertEqual(labels, binarizer.classes_.tolist())
    binarizer.fit(numpy.array(y))
    self.assertEqual(labels, binarizer.classes_.tolist())
    binarizer.fit(Series(numpy.array(y)))
    self.assertEqual(labels, binarizer.classes_.tolist())
Code Example #8
File: main.py Project: waveleu/jpmml-sklearn
def build_audit_na(classifier, name, with_proba=True):
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values=None),
                     Imputer()])
         for column in ["Age", "Income", "Hours"]] + [("Employment", [
             CategoricalDomain(missing_values=None),
             CategoricalImputer(),
             LookupTransformer(employment_mapping, "Other"),
             PMMLLabelBinarizer()
         ])] + [([column], [
             CategoricalDomain(missing_values=None),
             CategoricalImputer(),
             PMMLLabelBinarizer()
         ]) for column in ["Education", "Marital", "Occupation"]] +
        [("Gender", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(gender_mapping, None)
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Code Example #9
def build_auto_na_hist(regressor, name):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["displacement", "horsepower", "weight", "acceleration"]] +
		[([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_na_X, auto_na_y)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	store_csv(mpg, name)
Code Example #10
File: main.py Project: waveleu/jpmml-sklearn
def build_auto_na(regressor, name):
    mapper = DataFrameMapper([
        ([column], [ContinuousDomain(missing_values=None),
                    Imputer()])
        for column in ["acceleration", "displacement", "horsepower", "weight"]
    ] + [([column], [
        CategoricalDomain(missing_values=-1),
        CategoricalImputer(missing_values=-1),
        PMMLLabelBinarizer()
    ]) for column in ["cylinders", "model_year", "origin"]])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_na_X, auto_na_y)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
Code Example #11
File: main.py Project: dclong/jpmml-sklearn
def build_audit_na(classifier, name, with_proba = True):
	mapper = DataFrameMapper(
		[([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["Age", "Income", "Hours"]] +
		[([column], [CategoricalDomain(missing_values = None), CategoricalImputer(), PMMLLabelBinarizer()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	if(with_proba == True):
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")
Code Example #12
def build_audit_na_hist(classifier, name):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["Age", "Hours", "Income"]] +
		[([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Code Example #13
def label_binarizer(name):
	return PMMLLabelBinarizer() if name.endswith("NA") else LabelBinarizer()
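A brief note on the helper above: a trailing "NA" in the dataset name marks datasets that contain missing values, so only those get the missing-value-aware binarizer. The names below are purely illustrative:

label_binarizer("AuditNA")  # -> PMMLLabelBinarizer()
label_binarizer("Audit")    # -> LabelBinarizer()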
Code Example #14
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Code Example #15
                    max_depth=3,
                    random_state=13,
                    categorical_feature=lightgbm_categorical_feature))
])

xgboost_mapper = make_xgboost_column_transformer(dtypes,
                                                 missing_value_aware=True)
xgboost_pipeline = Pipeline([("mapper", xgboost_mapper),
                             ("classifier",
                              XGBClassifier(n_estimators=31,
                                            learning_rate=0.1,
                                            max_depth=3,
                                            random_state=13))])

sklearn_mapper = ColumnTransformer(
    [(str(cat_index), PMMLLabelBinarizer(sparse_output=False), [cat_index])
     for cat_index in range(0, len(cat_columns))] +
    [(str(cont_index), "passthrough", [cont_index])
     for cont_index in range(len(cat_columns), len(cat_columns + cont_columns))
     ],
    remainder="drop")

sklearn_pipeline = Pipeline([("mapper", sklearn_mapper),
                             ("classifier",
                              HistGradientBoostingClassifier(max_iter=31,
                                                             max_depth=3,
                                                             random_state=13))
                             ])

final_estimator = LogisticRegression(multi_class="ovr", random_state=13)
Code Example #16
def build_audit_na(classifier,
                   name,
                   with_proba=True,
                   predict_proba_transformer=None,
                   apply_transformer=None,
                   **pmml_options):
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper([(["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
              name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
              name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Income"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        Imputer()
    ])] + [(["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])] + [([column], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(missing_values=None),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])])
    pipeline = PMMLPipeline(
        [("mapper", mapper), ("classifier", classifier)],
        predict_proba_transformer=predict_proba_transformer,
        apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
Code Example #17
File: __init__.py Project: maxnoe/sklearn2pmml
def test_transform_string(self):
    X = ["A", None, "B", "C"]
    dense_binarizer = PMMLLabelBinarizer()
    dense_binarizer.fit(X)
    Xt_dense = dense_binarizer.transform(["A", "C", None, "B"])
    self.assertIsInstance(Xt_dense, numpy.ndarray)
    self.assertEqual([[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]],
                     Xt_dense.tolist())
    self.assertEqual([[0, 0, 0]],
                     dense_binarizer.transform([None]).tolist())
    self.assertEqual([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                     dense_binarizer.transform(["A", "B", "C"]).tolist())
    sparse_binarizer = PMMLLabelBinarizer(sparse_output=True)
    sparse_binarizer.fit(X)
    Xt_sparse = sparse_binarizer.transform(["A", "C", None, "B"])
    self.assertIsInstance(Xt_sparse, scipy.sparse.csr_matrix)
    self.assertEqual(Xt_dense.tolist(), Xt_sparse.toarray().tolist())
Code Example #18
def test_fit_float(self):
    y = [1.0, float("NaN"), 1.0, 2.0, float("NaN"), 3.0, 3.0, 2.0]
    labels = [1.0, 2.0, 3.0]
    binarizer = PMMLLabelBinarizer()
    binarizer.fit(y)
    self.assertEqual(labels, binarizer.classes_.tolist())
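Taken together, the test snippets above pin down what sets PMMLLabelBinarizer apart from scikit-learn's LabelBinarizer: missing labels (None or NaN) are dropped from classes_ at fit time and encoded as an all-zero row at transform time. A condensed sketch of that contract, assuming the sklearn2pmml.preprocessing import path used by recent sklearn2pmml releases:

from sklearn2pmml.preprocessing import PMMLLabelBinarizer

binarizer = PMMLLabelBinarizer()
binarizer.fit(["A", None, "B", "C"])
print(binarizer.classes_.tolist())                # ['A', 'B', 'C'] -- the missing label is dropped
print(binarizer.transform(["A", None]).tolist())  # [[1, 0, 0], [0, 0, 0]] -- missing maps to an all-zero row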