Esempio n. 1
0
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit, configure, verify and export a PMML pipeline for the Auto data with missing values.

	The regressor is fitted on the module-level ``auto_na_X`` / ``auto_na_y`` behind a
	DataFrameMapper that declares a domain for every column, imputes missing values
	and bins or encodes the features. The fitted pipeline is handed to ``store_pkl``
	and its predictions to ``store_csv``, both under ``name``.

	Args:
		regressor: Estimator used as the final pipeline step; it is fitted here.
		name: Identifier passed to ``store_pkl`` and ``store_csv``.
		predict_transformer: Forwarded unchanged to ``PMMLPipeline``.
		apply_transformer: Forwarded unchanged to ``PMMLPipeline``.
		**pmml_options: Keyword options passed to ``pipeline.configure``. For a
			``DecisionTreeRegressor`` a ``node_extensions`` entry holding the
			non-zero node impurities is added (replacing any caller-supplied one).
	"""
	feature_defs = []
	# Categorical columns in which -1 marks a missing value.
	for column in ["cylinders", "model_year"]:
		feature_defs.append(([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]))
	feature_defs.append((["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()]))
	# Continuous columns that are binned first and encoded afterwards.
	feature_defs.append((["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()]))
	feature_defs.append((["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()]))
	# Continuous columns with outlier treatment "as_extreme_values" and median imputation.
	feature_defs.append((["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")]))
	feature_defs.append((["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")]))
	mapper = DataFrameMapper(feature_defs)

	steps = [
		("mapper", mapper),
		("regressor", regressor)
	]
	pipeline = PMMLPipeline(steps, predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)

	is_tree = isinstance(regressor, DecisionTreeRegressor)
	if is_tree:
		# Record the impurity of every node whose impurity is non-zero,
		# keyed by the regressor's split criterion.
		fitted_tree = regressor.tree_
		node_impurity = {}
		for node_idx in range(fitted_tree.node_count):
			impurity = fitted_tree.impurity[node_idx]
			if impurity != 0.0:
				node_impurity[node_idx] = impurity
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	# Attach verification data drawn as a reproducible 5% sample of the inputs.
	verification_X = auto_na_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_X)
	store_pkl(pipeline, name)

	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if is_tree:
		# For trees, also export the identifier of the node each row ends up in.
		Xt = pipeline_transform(pipeline, auto_na_X)
		node_ids = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, node_ids), axis = 1)
	store_csv(mpg, name)
Esempio n. 2
0
def build_auto_na(regressor, name):
    """Fit and export a PMML pipeline for the Auto data with missing values.

    The regressor is fitted on the module-level ``auto_na_X`` / ``auto_na_y``
    behind a DataFrameMapper that imputes, bins and encodes the columns.
    The fitted pipeline is stored as ``name + ".pkl"`` and its predictions
    as ``name + ".csv"``.

    Args:
        regressor: Estimator used as the final pipeline step; it is fitted
            here. For a ``DecisionTreeRegressor`` the non-zero node
            impurities are configured as node extensions.
        name: Base name of the stored artifacts (without extension).
    """
    feature_defs = []
    # Categorical columns in which -1 marks a missing value.
    for column in ["cylinders", "model_year"]:
        feature_defs.append(([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer(),
        ]))
    feature_defs.append((["origin"], [
        CategoricalImputer(missing_values=-1),
        OneHotEncoder(),
    ]))
    # Continuous columns that are binned and then binarized.
    acceleration_bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25]
    feature_defs.append((["acceleration"], [
        ContinuousDomain(missing_values=None),
        CutTransformer(bins=acceleration_bins, labels=False),
        CategoricalImputer(),
        LabelBinarizer(),
    ]))
    feature_defs.append((["displacement"], [
        ContinuousDomain(missing_values=None),
        Imputer(),
        CutTransformer(bins=[0, 100, 200, 300, 400, 500],
                       labels=["XS", "S", "M", "L", "XL"]),
        LabelBinarizer(),
    ]))
    # Continuous columns with outlier treatment "as_extreme_values".
    feature_defs.append((["horsepower"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_extreme_values",
                         low_value=50,
                         high_value=225),
        Imputer(),
    ]))
    feature_defs.append((["weight"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_extreme_values",
                         low_value=2000,
                         high_value=5000),
        Imputer(),
    ]))
    mapper = DataFrameMapper(feature_defs)

    steps = [("mapper", mapper), ("regressor", regressor)]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(auto_na_X, auto_na_y)

    if isinstance(regressor, DecisionTreeRegressor):
        # Record the impurity of every node whose impurity is non-zero,
        # keyed by the regressor's split criterion.
        fitted_tree = regressor.tree_
        node_impurity = {}
        for node_idx in range(fitted_tree.node_count):
            impurity = fitted_tree.impurity[node_idx]
            if impurity != 0.0:
                node_impurity[node_idx] = impurity
        extensions = {regressor.criterion: node_impurity}
        pipeline.configure(node_extensions=extensions)

    store_pkl(pipeline, name + ".pkl")
    predictions = pipeline.predict(auto_na_X)
    mpg = DataFrame(predictions, columns=["mpg"])
    store_csv(mpg, name + ".csv")
Esempio n. 3
0
def build_audit_cat(classifier, name, with_proba = True, fit_params = None):
	"""Fit, verify and export a PMML pipeline for the Audit data using categorical encodings.

	The classifier is fitted on the module-level ``audit_X`` / ``audit_y`` behind a
	DataFrameMapper, the fitted pipeline is converted with ``make_pmml_pipeline``,
	and the result is handed to ``store_pkl`` and its predictions to ``store_csv``.

	Args:
		classifier: Estimator used as the final pipeline step; it is fitted here.
		name: Identifier passed to ``store_pkl`` and ``store_csv``.
		with_proba: When equal to ``True``, the two class probabilities are
			exported next to the predicted label.
		fit_params: Optional dict of keyword arguments forwarded to
			``Pipeline.fit``. ``None`` (the default) means no extra arguments.
	"""
	# A fresh dict per call: a ``{}`` default would be one object shared by all calls.
	if fit_params is None:
		fit_params = {}
	marital_mapping = {
		"Married-spouse-absent" : "Married"
	}
	mapper = DataFrameMapper(
		[([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
		# The ordinal encoders below use different output dtypes for different columns.
		[(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
		[(["Marital"], [CategoricalDomain(display_name = "Marital"), FilterLookupTransformer(marital_mapping), OrdinalEncoder(dtype = numpy.uint16)])] +
		# numpy.float64 is what the removed (NumPy 2.0) alias numpy.float_ referred to.
		[(["Occupation"], [CategoricalDomain(display_name = "Occupation"), OrdinalEncoder(dtype = numpy.float64)])] +
		[([column], [CategoricalDomain(display_name = column), LabelEncoder()]) for column in ["Gender", "Deductions"]]
	)
	pipeline = Pipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	# Convert the fitted plain Pipeline, supplying the input column names and the target name.
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	# Attach verification data drawn as a reproducible 5% sample of the inputs.
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Esempio n. 4
0
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    """Fit and export a PMML pipeline for the Audit data with label-encoded categoricals.

    The classifier is fitted on the module-level ``audit_X`` / ``audit_y``
    behind a DataFrameMapper, the fitted pipeline is converted with
    ``make_pmml_pipeline``, stored as ``name + ".pkl"``, and its predictions
    are written as ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final pipeline step; it is fitted
            here.
        name: Base name of the stored artifacts (without extension).
        with_proba: When equal to ``True``, the two class probabilities are
            exported next to the predicted label.
        **fit_params: Extra keyword arguments forwarded to ``Pipeline.fit``.
    """
    categorical_columns = [
        "Employment", "Education", "Marital", "Occupation", "Gender",
        "Deductions"
    ]
    feature_defs = [([column], ContinuousDomain())
                    for column in ["Age", "Income"]]
    # "Hours" is binned into five left-closed intervals.
    hours_cut = CutTransformer(bins=[0, 20, 40, 60, 80, 100],
                               labels=False,
                               right=False,
                               include_lowest=True)
    feature_defs.append((["Hours"], [ContinuousDomain(), hours_cut]))
    for column in categorical_columns:
        feature_defs.append(([column], [CategoricalDomain(), LabelEncoder()]))
    mapper = DataFrameMapper(feature_defs)

    steps = [("mapper", mapper), ("classifier", classifier)]
    pipeline = Pipeline(steps)
    pipeline.fit(audit_X, audit_y, **fit_params)
    # Convert the fitted plain Pipeline, supplying input column names and
    # the target name.
    feature_names = audit_X.columns.values
    pipeline = make_pmml_pipeline(pipeline, feature_names, audit_y.name)
    store_pkl(pipeline, name + ".pkl")

    labels = pipeline.predict(audit_X)
    adjusted = DataFrame(labels, columns=["Adjusted"])
    # Equality (not truthiness) decides whether probabilities are exported.
    if with_proba == True:
        probabilities = pipeline.predict_proba(audit_X)
        adjusted_proba = DataFrame(
            probabilities, columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Esempio n. 5
0
		tree = regressor.tree_
		node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if isinstance(regressor, DecisionTreeRegressor):
		Xt = pipeline_transform(pipeline, auto_na_X)
		mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, mpg_apply), axis = 1)
	store_csv(mpg, name)

# Build the Auto models with missing values only when the "Auto" dataset was requested.
if "Auto" in datasets:
	# Decision tree: the apply transformer evaluates "X[0] - 1" and exposes the result
	# under the alias "eval(nodeId)"; winner_id is passed on as an extra keyword option.
	build_auto_na(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAutoNA", apply_transformer = Alias(ExpressionTransformer("X[0] - 1"), "eval(nodeId)", prefit = True), winner_id = True)
	# Linear regression: the predict transformer bins the prediction into four labelled ranges.
	build_auto_na(LinearRegression(), "LinearRegressionAutoNA", predict_transformer = CutTransformer(bins = [0, 10, 20, 30, 40], labels = ["0-10", "10-20", "20-30", "30-40"]))

# (Re)load the "AutoNA" data for the definitions that follow.
auto_na_X, auto_na_y = load_auto("AutoNA")

# Cast the categorical columns to pandas' "Int64" dtype.
# NOTE(review): presumably so that missing values can be represented in integer columns - confirm.
auto_na_X["cylinders"] = auto_na_X["cylinders"].astype("Int64")
auto_na_X["model_year"] = auto_na_X["model_year"].astype("Int64")
auto_na_X["origin"] = auto_na_X["origin"].astype("Int64")

def build_auto_na_hist(regressor, name):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["displacement", "horsepower", "weight", "acceleration"]] +
		[([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
Esempio n. 6
0
 def test_transform(self):
     """Check the bin codes produced by CutTransformer.

     Covers right-closed and left-closed intervals over bins with infinite
     outer edges, then bounded bins with ``include_lowest=True``, where
     values outside the outermost edges must come back as NaN.
     """
     bins = [float("-inf"), -1.0, 0.0, 1.0, float("+inf")]
     transformer = CutTransformer(bins, labels=False, right=True)
     X = numpy.array([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
     # Right-closed: a value equal to an edge falls into the bin on its left.
     self.assertEqual([[0], [0], [1], [1], [2], [2], [3]],
                      transformer.transform(X).tolist())
     transformer = CutTransformer(bins, labels=False, right=False)
     # Left-closed: a value equal to an edge falls into the bin on its right.
     self.assertEqual([[0], [1], [1], [2], [2], [3], [3]],
                      transformer.transform(X).tolist())
     bins = [-3.0, -1.0, 1.0, 3.0]
     transformer = CutTransformer(bins,
                                  labels=False,
                                  right=True,
                                  include_lowest=True)
     X = numpy.array([-3.0, -2.0, 2.0, 3.0])
     # With include_lowest the lowest edge (-3.0) is part of the first bin.
     self.assertEqual([[0], [0], [2], [2]],
                      transformer.transform(X).tolist())
     # transform() yields one row per sample (see the nested lists above), so
     # ``isnan(...).tolist()[0]`` would be a non-empty list and always truthy.
     # Reduce with ``all()`` so the out-of-range checks can actually fail.
     X = numpy.array([-5.0])
     self.assertTrue(numpy.isnan(transformer.transform(X)).all())
     X = numpy.array([5.0])
     self.assertTrue(numpy.isnan(transformer.transform(X)).all())
Esempio n. 7
0
        mpg_apply = DataFrame(regressor.apply(Xt), columns=["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis=1)
    store_csv(mpg, name + ".csv")


# Build the Auto models with missing values only when the "Auto" dataset was requested.
if "Auto" in datasets:
    # Decision tree: the apply transformer evaluates "X[:, 0] - 1" and exposes
    # the result under the alias "eval(nodeId)"; winner_id is passed on as an
    # extra keyword option.
    build_auto_na(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
                  "DecisionTreeAutoNA",
                  apply_transformer=Alias(ExpressionTransformer("X[:, 0] - 1"),
                                          "eval(nodeId)",
                                          prefit=True),
                  winner_id=True)
    # Linear regression: the predict transformer bins the prediction into
    # four labelled ranges.
    build_auto_na(LinearRegression(),
                  "LinearRegressionAutoNA",
                  predict_transformer=CutTransformer(
                      bins=[0, 10, 20, 30, 40],
                      labels=["0-10", "10-20", "20-30", "30-40"]))

# Load the Housing features and target used by the definitions that follow.
housing_X, housing_y = load_housing("Housing.csv")


def build_housing(regressor, name, with_kneighbors=False, **pmml_options):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper),
                         ("transformer-pipeline",
                          Pipeline([
                              ("polynomial",
                               PolynomialFeatures(degree=2,
                                                  interaction_only=True,
                                                  include_bias=False)),
                              ("scaler", StandardScaler()),
Esempio n. 8
0
# Mapper for the per-column features.
scalar_mapper = DataFrameMapper([
    # Categorical columns: binarize the labels, then keep the 3 best-scoring
    # indicator columns according to the chi2 score function.
    ("Education",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    ("Employment",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    ("Occupation",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    # Age is cut into four labelled bins ("q1".."q4") and then binarized.
    ("Age", [
        ContinuousDomain(),
        CutTransformer(bins=[17, 28, 37, 47, 83],
                       labels=["q1", "q2", "q3", "q4"]),
        LabelBinarizer()
    ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()),
    # Derived feature: with the column order ["Hours", "Income"], X[0] is Hours
    # and X[1] is Income, so the expression is Income / (Hours * 52).
    (["Hours", "Income"],
     Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
# Mapper for the binarized Gender and Marital columns.
# NOTE(review): the name suggests these feed an interaction step further down - confirm.
interaction_mapper = DataFrameMapper([
    ("Gender", [CategoricalDomain(), LabelBinarizer()]),
    ("Marital", [CategoricalDomain(), LabelBinarizer()])
])
# XGBoost classifier with its default settings.
classifier = XGBClassifier()

pipeline = PMMLPipeline([
    ("mapper",
     FeatureUnion([("scalar_mapper", scalar_mapper),
                   ("interaction",
Esempio n. 9
0
# Load the Audit data from the local CSV file.
audit_df = pandas.read_csv("csv/Audit.csv")
#print(audit_df.head(5))

# Features are every column except the "Adjusted" target.
audit_X = audit_df[audit_df.columns.difference(["Adjusted"])]
audit_y = audit_df["Adjusted"]

# Initialize H2O before the H2O estimator below is created.
h2o.init()

# Declare the domain of every input column; no encoding is applied to the
# categorical columns here.
mapper = DataFrameMapper([("Education", CategoricalDomain()),
                          ("Employment", CategoricalDomain()),
                          ("Gender", CategoricalDomain()),
                          ("Marital", CategoricalDomain()),
                          ("Occupation", CategoricalDomain()),
                          # Age is cut into four labelled bins ("q1".."q4").
                          ("Age", [
                              ContinuousDomain(),
                              CutTransformer(bins=[17, 28, 37, 47, 83],
                                             labels=["q1", "q2", "q3", "q4"])
                          ]), ("Hours", ContinuousDomain()),
                          ("Income", ContinuousDomain()),
                          # Derived feature: with the column order
                          # ["Hours", "Income"], X[0] is Hours and X[1] is
                          # Income, so this is Income / (Hours * 52).
                          (["Hours", "Income"],
                           Alias(ExpressionTransformer("X[1] / (X[0] * 52)"),
                                 "Hourly_Income"))])
# H2O random forest with 17 trees.
classifier = H2ORandomForestEstimator(ntrees=17)

# Post-processing for predicted probabilities: select X[1] and bin it into a
# "no" / "maybe" / "yes" label exposed under the alias "Decision".
# NOTE(review): X[1] is presumably the probability of the positive class - confirm.
predict_proba_transformer = Pipeline([
    ("expression", ExpressionTransformer("X[1]")),
    ("cut",
     Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0],
                          labels=["no", "maybe", "yes"]),
           "Decision",
           prefit=True))
])