Exemple #1
0
def build_auto(regressor, name, **pmml_options):
	"""Fit, verify and persist an Auto regression pipeline.

	A DataFrameMapper, a SelectUnique step and the given regressor are
	combined into a PMMLPipeline. The pipeline is fitted on the module-level
	``auto_X`` / ``auto_y``, configured with ``pmml_options``, verified on a
	5% sample of ``auto_X`` and handed to ``store_pkl``. Its predictions for
	``auto_X`` are handed to ``store_csv`` as a single "mpg" column.

	regressor -- final estimator of the pipeline
	name -- identifier passed to ``store_pkl`` and ``store_csv``
	pmml_options -- keyword arguments forwarded to ``pipeline.configure``
	"""
	# Labels for selected (cylinders, origin) pairs; the lookup transformer
	# below is given "other" as its default_value.
	pair_labels = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	cylinders_origin_steps = [
		MultiDomain([CategoricalDomain(), CategoricalDomain()]),
		MultiLookupTransformer(pair_labels, default_value = "other"),
		LabelBinarizer()
	]
	odd_cylinders = Alias(ExpressionTransformer("X[0] % 2.0 > 0.0"), name = "odd(cylinders)", prefit = True)
	# Pre/post 1973 oil crisis effects
	model_year_steps = [CategoricalDomain(), Binarizer(threshold = 77)]
	year_origin_selector = SelectorProxy(SelectFromModel(RandomForestRegressor(random_state = 13, n_estimators = 3), threshold = "1.25 * mean"))
	year_origin_steps = [
		MultiDomain([CategoricalDomain(), CategoricalDomain()]),
		ConcatTransformer("/"),
		LabelBinarizer(),
		year_origin_selector
	]
	features = [
		(["cylinders", "origin"], cylinders_origin_steps),
		(["cylinders"], odd_cylinders),
		(["model_year"], model_year_steps, {"alias" : "bin(model_year, 77)"}),
		(["model_year", "origin"], year_origin_steps),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(features)),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	# XGBoost regressors are verified with explicit tolerances
	verify_options = {}
	if isinstance(regressor, XGBRegressor):
		verify_options = {"precision" : 1e-5, "zeroThreshold" : 1e-5}
	pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), **verify_options)
	store_pkl(pipeline, name)
	predictions = pipeline.predict(auto_X)
	store_csv(DataFrame(predictions, columns = ["mpg"]), name)
Exemple #2
0
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Fit, verify and persist an Auto regression pipeline.

	A DataFrameMapper, a SelectUnique step and the given regressor are
	combined into a PMMLPipeline. The pipeline is fitted on the module-level
	``auto_X`` / ``auto_y``, configured with ``pmml_options``, verified on a
	5% sample of ``auto_X`` and handed to ``store_pkl``. Its predictions for
	``auto_X`` are handed to ``store_csv`` as a single "mpg" column.

	regressor -- final estimator of the pipeline
	name -- identifier passed to ``store_pkl`` and ``store_csv``
	fit_params -- optional dict of keyword arguments for ``pipeline.fit``
	predict_params -- optional dict of keyword arguments for ``pipeline.predict``;
		also passed to ``pipeline.verify`` as ``predict_params``
	pmml_options -- keyword arguments forwarded to ``pipeline.configure``
	"""
	# None stands for "no extra parameters". A fresh dict per call avoids
	# sharing one mutable default object between invocations (the dict is
	# handed to pipeline.verify, which is free to keep or modify it).
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	# Labels for selected (cylinders, origin) pairs; the lookup transformer
	# below is given "other" as its default_value.
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		# model_year is cast to a string, expanded to "19<yy>-01-01", cast to a date and binarized relative to 1977
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost regressors are verified with explicit tolerances
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Exemple #3
0
	def test_sequence_transform(self):
		# Chain an expression that maps null cells to 0 with an Imputer that
		# treats 0 as the missing value; every row is expected to come out as 1.
		frame = DataFrame([[None], [1], [None]], columns = ["a"])
		steps = [
			ExpressionTransformer("0 if pandas.isnull(X[0]) else X[0]"),
			Imputer(missing_values = 0)
		]
		mapper = DataFrameMapper([(["a"], steps)])
		transformed = mapper.fit_transform(frame)
		self.assertEqual([[1], [1], [1]], transformed.tolist())
Exemple #4
0
	def test_timedelta_seconds(self):
		# Both timestamp columns go through SecondsSinceYearTransformer(year = 2010);
		# the expression then subtracts the second column from the first.
		rows = [
			["2018-12-31T23:59:59", "2019-01-01T00:00:00"],
			["2019-01-01T03:30:03", "2019-01-01T00:00:00"]
		]
		frame = DataFrame(rows, columns = ["left", "right"])
		steps = [
			DateTimeDomain(),
			SecondsSinceYearTransformer(year = 2010),
			ExpressionTransformer("X[0] - X[1]")
		]
		mapper = DataFrameMapper([(["left", "right"], steps)])
		transformed = mapper.fit_transform(frame)
		self.assertEqual([[-1], [12603]], transformed.tolist())
Exemple #5
0
def build_auto(regressor, name, **kwargs):
    """Fit, verify and persist an Auto regression pipeline.

    A plain Pipeline (mapper + regressor) is fitted on the module-level
    ``auto_X`` / ``auto_y`` and wrapped with ``make_pmml_pipeline``. The
    result is verified on a 5% sample of ``auto_X``; afterwards
    ``customize`` is called on the regressor with ``kwargs``, the pipeline
    goes to ``store_pkl`` under ``name + ".pkl"`` and its predictions go to
    ``store_csv`` under ``name + ".csv"`` as a single "mpg" column.

    regressor -- final estimator of the pipeline
    name -- base name for the stored artifacts
    kwargs -- keyword arguments forwarded to ``customize``
    """
    scaled_columns = ["displacement", "horsepower", "weight", "acceleration"]
    features = [
        (["cylinders"], CategoricalDomain()),
        (scaled_columns, [ContinuousDomain(), StandardScaler()]),
        # Pre/post 1973 oil crisis effects
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)],
         {"alias": "bin(model_year, 77)"}),
        (["origin"], OneHotEncoder()),
        (["weight", "displacement"],
         ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"),
         {"alias": "weight / displacement + 0.5"}),
    ]
    fitted = Pipeline([
        ("mapper", DataFrameMapper(features)),
        ("regressor", regressor),
    ])
    fitted.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(fitted, auto_X.columns.values, auto_y.name)
    # XGBoost regressors are verified with explicit tolerances
    verify_options = {}
    if isinstance(regressor, XGBRegressor):
        verify_options = {"precision": 1e-5, "zeroThreshold": 1e-5}
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13), **verify_options)
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    predictions = pipeline.predict(auto_X)
    store_csv(DataFrame(predictions, columns=["mpg"]), name + ".csv")
Exemple #6
0
 def test_timedelta_days(self):
     # Each date column is converted by its own mapper inside a FeatureUnion;
     # the aliased expression then subtracts the second union output from the
     # first.
     def since_2010(column):
         # Single-column mapper: DateDomain followed by
         # DaysSinceYearTransformer(year=2010)
         return DataFrameMapper([
             (column, [DateDomain(), DaysSinceYearTransformer(year=2010)])
         ])

     frame = DataFrame(
         [["2018-12-31", "2019-01-01"], ["2019-01-31", "2019-01-01"]],
         columns=["left", "right"])
     union = FeatureUnion([
         ("left_mapper", since_2010("left")),
         ("right_mapper", since_2010("right"))
     ])
     difference = Alias(ExpressionTransformer("X[0] - X[1]"),
                        "delta(left, right)",
                        prefit=True)
     pipeline = Pipeline([("union", union), ("expression", difference)])
     transformed = pipeline.fit_transform(frame)
     self.assertEqual([[-1], [30]], transformed.tolist())
Exemple #7
0
def build_auto(regressor, name, **pmml_options):
	"""Fit, verify and persist an Auto regression pipeline.

	A plain Pipeline (mapper + regressor) is fitted on the module-level
	``auto_X`` / ``auto_y``, wrapped with ``make_pmml_pipeline`` and
	configured with ``pmml_options``. It is verified on a 5% sample of
	``auto_X``, handed to ``store_pkl`` under ``name + ".pkl"``, and its
	predictions are handed to ``store_csv`` under ``name + ".csv"`` as a
	single "mpg" column.

	regressor -- final estimator of the pipeline
	name -- base name for the stored artifacts
	pmml_options -- keyword arguments forwarded to ``pipeline.configure``
	"""
	# Labels for selected (cylinders, origin) pairs; the lookup transformer
	# below is given "other" as its default_value.
	pair_labels = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	cylinders_origin_steps = [
		MultiDomain([CategoricalDomain(), CategoricalDomain()]),
		MultiLookupTransformer(pair_labels, default_value = "other"),
		LabelBinarizer()
	]
	features = [
		(["cylinders", "origin"], cylinders_origin_steps),
		# Pre/post 1973 oil crisis effects
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	fitted = Pipeline([
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	])
	fitted.fit(auto_X, auto_y)
	pipeline = make_pmml_pipeline(fitted, auto_X.columns.values, auto_y.name)
	pipeline.configure(**pmml_options)
	# XGBoost regressors are verified with explicit tolerances
	verify_options = {}
	if isinstance(regressor, XGBRegressor):
		verify_options = {"precision" : 1e-5, "zeroThreshold" : 1e-5}
	pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), **verify_options)
	store_pkl(pipeline, name + ".pkl")
	predictions = pipeline.predict(auto_X)
	store_csv(DataFrame(predictions, columns = ["mpg"]), name + ".csv")
Exemple #8
0
def build_audit_na(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit, verify and persist an Audit classification pipeline on data with missing values.

	The mapper and the given classifier are combined into a PMMLPipeline
	that is fitted on the module-level ``audit_na_X`` / ``audit_na_y``,
	configured with ``pmml_options``, verified on a 5% sample of
	``audit_na_X`` and handed to ``store_pkl``. Predictions (plus class
	probabilities and, for decision trees, node ids) are handed to
	``store_csv``.

	classifier -- final estimator of the pipeline
	name -- identifier passed to ``store_pkl`` and ``store_csv``
	with_proba -- when True, "probability(0)" / "probability(1)" columns are appended
	fit_params -- optional dict of keyword arguments for ``pipeline.fit``
	predict_params -- optional dict of keyword arguments for ``pipeline.predict``
	predict_proba_params -- optional dict of keyword arguments for ``pipeline.predict_proba``
	predict_transformer, predict_proba_transformer, apply_transformer --
		passed through to the PMMLPipeline constructor
	pmml_options -- keyword arguments forwarded to ``pipeline.configure``
	"""
	# None stands for "no extra parameters". A fresh dict per call avoids
	# sharing one mutable default object between invocations (the dicts are
	# handed to pipeline.verify, which is free to keep or modify them).
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	predict_proba_params = {} if predict_proba_params is None else predict_proba_params
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	# Missing Age / Hours cells are first rewritten to the sentinel -999 by an
	# expression, and the sentinel is then handled by a SimpleImputer.
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost classifiers are verified with explicit tolerances
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	# Decision trees additionally report the node id from classifier.apply
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Exemple #9
0
def build_audit_na(classifier, name, with_proba = True, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit and persist an Audit classification pipeline on data with missing values.

	The mapper and the given classifier are combined into a PMMLPipeline
	that is fitted on the module-level ``audit_na_X`` / ``audit_na_y``,
	configured with ``pmml_options`` and handed to ``store_pkl`` under
	``name + ".pkl"``. Predictions (plus class probabilities and, for
	decision trees, node ids) are handed to ``store_csv`` under
	``name + ".csv"``.

	classifier -- final estimator of the pipeline
	name -- base name for the stored artifacts
	with_proba -- when True, "probability(0)" / "probability(1)" columns are appended
	predict_proba_transformer, apply_transformer -- passed through to the
		PMMLPipeline constructor
	pmml_options -- keyword arguments forwarded to ``pipeline.configure``
	"""
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0,
		"MALE" : 1
	}
	features = []
	# Missing Age / Hours cells are first rewritten to the sentinel -999 by an
	# expression, and the sentinel is then handled by an Imputer.
	features.append((["Age"], [
		ContinuousDomain(missing_values = None, with_data = False),
		Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name = "flag_missing(Age, -999)"),
		Imputer(missing_values = -999)
	]))
	features.append((["Hours"], [
		ContinuousDomain(missing_values = None, with_data = False),
		Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"),
		Imputer(missing_values = -999)
	]))
	features.append((["Income"], [
		ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False),
		Imputer()
	]))
	features.append((["Employment"], [
		CategoricalDomain(missing_values = None, with_data = False),
		CategoricalImputer(),
		StringNormalizer(function = "uppercase"),
		LookupTransformer(employment_mapping, "OTHER"),
		StringNormalizer(function = "lowercase"),
		PMMLLabelBinarizer()
	]))
	# Each of these columns gets its own, freshly constructed transformer chain
	for column in ["Education", "Marital", "Occupation"]:
		features.append(([column], [
			CategoricalDomain(missing_values = None, with_data = False),
			CategoricalImputer(missing_values = None),
			StringNormalizer(function = "lowercase"),
			PMMLLabelBinarizer()
		]))
	features.append((["Gender"], [
		CategoricalDomain(missing_values = None, with_data = False),
		CategoricalImputer(),
		StringNormalizer(function = "uppercase"),
		LookupTransformer(gender_mapping, None)
	]))
	steps = [
		("mapper", DataFrameMapper(features)),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	result = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	if with_proba == True:
		probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		result = pandas.concat((result, probabilities), axis = 1)
	# Decision trees additionally report the node id from classifier.apply
	if isinstance(classifier, DecisionTreeClassifier):
		transformed = pipeline_transform(pipeline, audit_na_X)
		node_ids = DataFrame(classifier.apply(transformed), columns = ["nodeId"])
		result = pandas.concat((result, node_ids), axis = 1)
	store_csv(result, name + ".csv")
Exemple #10
0
def build_auto(regressor, name, **kwargs):
	"""Fit and persist an Auto regression pipeline.

	The mapper and the given regressor are combined into a PMMLPipeline that
	is fitted on the module-level ``auto_X`` / ``auto_y``. Afterwards
	``customize`` is called on the regressor with ``kwargs``, the pipeline
	goes to ``store_pkl`` under ``name + ".pkl"`` and its predictions go to
	``store_csv`` under ``name + ".csv"`` as a single "mpg" column.

	regressor -- final estimator of the pipeline
	name -- base name for the stored artifacts
	kwargs -- keyword arguments forwarded to ``customize``
	"""
	numeric_columns = ["displacement", "horsepower", "weight", "acceleration"]
	numeric_steps = [
		ContinuousDomain(missing_values = None),
		Imputer(missing_values = "NaN"),
		StandardScaler()
	]
	features = [
		(["cylinders"], CategoricalDomain()),
		(numeric_columns, numeric_steps),
		# Pre/post 1973 oil crisis effects
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
		(["origin"], OneHotEncoder()),
		(["weight", "displacement"], ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	steps = [
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(auto_X, auto_y)
	customize(regressor, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	predictions = pipeline.predict(auto_X)
	store_csv(DataFrame(predictions, columns = ["mpg"]), name + ".csv")
Exemple #11
0
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")


# Build the Audit-NA models only when "Audit" is among the selected datasets.
if "Audit" in datasets:
    # Decision tree: the apply output is post-processed by the aliased
    # expression "X[:, 0] - 1" (exposed as "eval(nodeId)"); winner_id and
    # class_extensions are passed on as extra keyword options.
    build_audit_na(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
                   "DecisionTreeAuditNA",
                   apply_transformer=Alias(
                       ExpressionTransformer("X[:, 0] - 1"),
                       "eval(nodeId)",
                       prefit=True),
                   winner_id=True,
                   class_extensions={"event": {
                       "0": False,
                       "1": True
                   }})
    # Logistic regression: the probability output is thresholded at 0.75 on
    # its second column by the aliased expression "eval(probability(1))".
    build_audit_na(
        LogisticRegression(solver="newton-cg", max_iter=500),
        "LogisticRegressionAuditNA",
        predict_proba_transformer=Alias(
            ExpressionTransformer("numpy.where(X[:, 1] > 0.75, 1, 0)"),
            name="eval(probability(1))",
            prefit=True))
Exemple #12
0
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)

# Build the Audit-NA models only when "Audit" is among the selected datasets.
if "Audit" in datasets:
	# Decision tree: the apply output is post-processed by the aliased expression "X[0] - 1"
	build_audit_na(DecisionTreeClassifier(min_samples_leaf = 5, random_state = 13), "DecisionTreeAuditNA", apply_transformer = Alias(ExpressionTransformer("X[0] - 1"), "eval(nodeId)", prefit = True), winner_id = True, class_extensions = {"event" : {"0" : False, "1" : True}})
	# Logistic regression: the second probability column is thresholded at 0.75
	build_audit_na(LogisticRegression(multi_class = "ovr", solver = "newton-cg", max_iter = 500), "LogisticRegressionAuditNA", predict_proba_transformer = Alias(ExpressionTransformer("1 if X[1] > 0.75 else 0"), name = "eval(probability(1))", prefit = True))
	# XGBoost: the same ntree_limit (71) is used for predict, predict_proba and as a keyword option
	build_audit_na(XGBClassifier(objective = "binary:logistic", random_state = 13), "XGBAuditNA", predict_params = {"ntree_limit" : 71}, predict_proba_params = {"ntree_limit" : 71}, predict_transformer = Alias(ExpressionTransformer("X[0]"), name = "eval(Adjusted)", prefit = True), ntree_limit = 71)

def build_audit_na_hist(classifier, name):
	"""Fit, verify and persist an Audit-NA pipeline without explicit imputation steps.

	Continuous columns only get a ContinuousDomain; categorical columns get a
	CategoricalDomain followed by a PMMLLabelBinarizer. The pipeline is
	fitted on the module-level ``audit_na_X`` / ``audit_na_y``, verified on
	a 5% sample of ``audit_na_X`` and handed to ``store_pkl``.

	classifier -- final estimator of the pipeline
	name -- identifier passed to ``store_pkl``
	"""
	continuous_columns = ["Age", "Hours", "Income"]
	categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	features = [([column], ContinuousDomain()) for column in continuous_columns]
	features += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in categorical_columns]
	steps = [
		("mapper", DataFrameMapper(features)),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(audit_na_X, audit_na_y)
	verification_sample = audit_na_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_sample)
	store_pkl(pipeline, name)
# NOTE(review): presumably starts or connects to an H2O instance - confirm against the h2o docs.
h2o.init()

# Local feature mapper: categorical columns are only declared; Age is
# additionally cut into four labelled bins; Hours and Income feed the derived
# "Hourly_Income" feature, computed as Income / (Hours * 52).
mapper = DataFrameMapper([("Education", CategoricalDomain()),
                          ("Employment", CategoricalDomain()),
                          ("Gender", CategoricalDomain()),
                          ("Marital", CategoricalDomain()),
                          ("Occupation", CategoricalDomain()),
                          ("Age", [
                              ContinuousDomain(),
                              CutTransformer(bins=[17, 28, 37, 47, 83],
                                             labels=["q1", "q2", "q3", "q4"])
                          ]), ("Hours", ContinuousDomain()),
                          ("Income", ContinuousDomain()),
                          (["Hours", "Income"],
                           Alias(ExpressionTransformer("X[1] / (X[0] * 52)"),
                                 "Hourly_Income"))])
classifier = H2ORandomForestEstimator(ntrees=17)

# Post-processing for predict_proba output: take column 1 and cut it into
# "no" / "maybe" / "yes" using the bin edges 0.0, 0.75, 0.90 and 1.0; the
# result is exposed under the alias "Decision".
predict_proba_transformer = Pipeline([
    ("expression", ExpressionTransformer("X[1]")),
    ("cut",
     Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0],
                          labels=["no", "maybe", "yes"]),
           "Decision",
           prefit=True))
])

pipeline = PMMLPipeline([("local_mapper", mapper),
                         ("uploader", H2OFrameCreator()),
                         ("remote_classifier", classifier)],
Exemple #14
0
 def test_transform(self):
     # Column-name expressions on a DataFrame, first with an explicit
     # integer dtype, then with an explicit float dtype.
     frame = DataFrame([[0.5, 0.5], [1.0, 2.0]], columns=["a", "b"])
     as_int = ExpressionTransformer("X['a'] + X['b']", dtype=int)
     for attribute in ("expr", "dtype"):
         self.assertTrue(hasattr(as_int, attribute))
     int_result = as_int.fit_transform(frame)
     self.assertIsInstance(int_result, numpy.ndarray)
     self.assertEqual(int, int_result.dtype)
     self.assertEqual([[1], [3]], int_result.tolist())
     as_float = ExpressionTransformer("X['a'] + X['b']", dtype=float)
     float_result = as_float.fit_transform(frame)
     self.assertIsInstance(float_result, numpy.ndarray)
     self.assertEqual(float, float_result.dtype)
     self.assertEqual([[1.0], [3.0]], float_result.tolist())
     # Positional expressions on a plain array; dtype is left unset.
     addition = ExpressionTransformer("X[0] + X[1]")
     for attribute in ("expr", "dtype"):
         self.assertTrue(hasattr(addition, attribute))
     self.assertIsNone(addition.dtype)
     matrix = numpy.array([[0.5, 0.5], [1.0, 2.0]])
     sum_result = addition.fit_transform(matrix)
     self.assertIsInstance(sum_result, numpy.ndarray)
     self.assertEqual([[1], [3]], sum_result.tolist())
     # Remaining arithmetic operators, checked in the original order.
     arithmetic_cases = [
         ("X[0] - X[1]", [[0.0], [-1.0]]),
         ("X[0] * X[1]", [[0.25], [2.0]]),
         ("X[0] / X[1]", [[1.0], [0.5]]),
     ]
     for expr, expected in arithmetic_cases:
         outcome = ExpressionTransformer(expr).fit_transform(matrix)
         self.assertEqual(expected, outcome.tolist())
Exemple #15
0
 def test_transform(self):
     # Column-slice expressions on a plain array; each case yields a flat
     # list with one value per row.
     addition = ExpressionTransformer("X[:, 0] + X[:, 1]")
     self.assertTrue(hasattr(addition, "expr_"))
     matrix = numpy.array([[0.5, 0.5], [1.0, 2.0]])
     self.assertEqual([1.0, 3.0], addition.fit_transform(matrix).tolist())
     # Remaining arithmetic operators, checked in the original order.
     arithmetic_cases = [
         ("X[:, 0] - X[:, 1]", [0.0, -1.0]),
         ("X[:, 0] * X[:, 1]", [0.25, 2.0]),
         ("X[:, 0] / X[:, 1]", [1.0, 0.5]),
     ]
     for expr, expected in arithmetic_cases:
         outcome = ExpressionTransformer(expr).fit_transform(matrix)
         self.assertEqual(expected, outcome.tolist())
Exemple #16
0
    ("Employment",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    ("Occupation",
     [CategoricalDomain(),
      LabelBinarizer(),
      SelectKBest(chi2, k=3)]),
    ("Age", [
        ContinuousDomain(),
        CutTransformer(bins=[17, 28, 37, 47, 83],
                       labels=["q1", "q2", "q3", "q4"]),
        LabelBinarizer()
    ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()),
    (["Hours", "Income"],
     Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
# Gender and Marital are label-binarized separately so that they can be fed
# to PolynomialFeatures below.
interaction_mapper = DataFrameMapper([
    ("Gender", [CategoricalDomain(), LabelBinarizer()]),
    ("Marital", [CategoricalDomain(), LabelBinarizer()])
])
classifier = XGBClassifier()

# The feature step is a union of scalar_mapper (defined earlier, outside this
# excerpt) and the interaction branch (interaction_mapper followed by
# PolynomialFeatures); its output goes to the XGBoost classifier.
pipeline = PMMLPipeline([
    ("mapper",
     FeatureUnion([("scalar_mapper", scalar_mapper),
                   ("interaction",
                    Pipeline([("interaction_mapper", interaction_mapper),
                              ("polynomial", PolynomialFeatures())]))])),
    ("classifier", classifier)
])
Exemple #17
0
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")


if "Audit" in datasets:
    build_audit_na(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
                   "DecisionTreeAuditNA",
                   apply_transformer=Alias(ExpressionTransformer("X[0] - 1"),
                                           "eval(nodeId)",
                                           prefit=True),
                   winner_id=True,
                   class_extensions={"event": {
                       "0": False,
                       "1": True
                   }})
    build_audit_na(LogisticRegression(solver="newton-cg", max_iter=500),
                   "LogisticRegressionAuditNA",
                   predict_proba_transformer=Alias(
                       ExpressionTransformer("1 if X[1] > 0.75 else 0"),
                       name="eval(probability(1))",
                       prefit=True))
    build_audit_na(OptimalXGBClassifier(objective="binary:logistic",
                                        ntree_limit=11,
Exemple #18
0
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)

# Build the Audit-NA models only when "Audit" is among the selected datasets.
if "Audit" in datasets:
	# Decision tree: the apply output is post-processed by the aliased expression "X[0] - 1"
	build_audit_na(DecisionTreeClassifier(min_samples_leaf = 5, random_state = 13), "DecisionTreeAuditNA", apply_transformer = Alias(ExpressionTransformer("X[0] - 1"), "eval(nodeId)", prefit = True), winner_id = True, class_extensions = {"event" : {"0" : False, "1" : True}})
	# Logistic regression: the second probability column is thresholded at 0.75
	build_audit_na(LogisticRegression(solver = "newton-cg", max_iter = 500), "LogisticRegressionAuditNA", predict_proba_transformer = Alias(ExpressionTransformer("1 if X[1] > 0.75 else 0"), name = "eval(probability(1))", prefit = True))

def build_audit_na_direct(classifier, name):
	"""Fit, verify and persist an Audit-NA pipeline that passes continuous columns through untouched.

	Continuous columns are mapped with ``None`` as their transformer and
	categorical columns with a OneHotEncoder. The pipeline is fitted on the
	module-level ``audit_na_X`` / ``audit_na_y``, verified on a 5% sample of
	``audit_na_X`` with explicit tolerances and handed to ``store_pkl``.

	classifier -- final estimator of the pipeline
	name -- identifier passed to ``store_pkl``
	"""
	continuous_columns = ["Age", "Hours", "Income"]
	categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	features = [
		(continuous_columns, None),
		(categorical_columns, OneHotEncoder())
	]
	steps = [
		("mapper", DataFrameMapper(features)),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(audit_na_X, audit_na_y)
	verification_sample = audit_na_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
	store_pkl(pipeline, name)
	# NOTE(review): the prediction frame is built but not used in the visible
	# code - the excerpt may be truncated here; confirm against the full file.
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
Exemple #19
0
# Hand the data frame (built earlier, outside this excerpt) to store_csv under the name "Apollo".
store_csv(df, "Apollo")


def build_apollo(mapper, name):
    """Fit and persist a decision tree on the module-level ``df`` using the given mapper.

    The pipeline (mapper + DecisionTreeClassifier) is fitted on ``df`` with
    ``df["success"]`` as the target and handed to ``store_pkl``. Predicted
    labels and class probabilities are concatenated column-wise and handed
    to ``store_csv``.

    mapper -- feature mapper used as the first pipeline step
    name -- identifier passed to ``store_pkl`` and ``store_csv``
    """
    steps = [("mapper", mapper), ("classifier", DecisionTreeClassifier())]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(df, df["success"])
    store_pkl(pipeline, name)
    predicted = DataFrame(pipeline.predict(df), columns=["success"])
    probability_columns = ["probability(false)", "probability(true)"]
    probabilities = DataFrame(pipeline.predict_proba(df),
                              columns=probability_columns)
    combined = pandas.concat((predicted, probabilities), axis=1)
    store_csv(combined, name)


# Mission duration: both timestamps go through DaysSinceYearTransformer(year=1968),
# then the expression subtracts "launch" (X[0]) from "return" (X[1]).
mapper = DataFrameMapper([(["launch", "return"], [
    DateTimeDomain(),
    DaysSinceYearTransformer(year=1968),
    ExpressionTransformer("X[1] - X[0]")
])])

build_apollo(mapper, "DurationInDaysApollo")

# Same computation, but with SecondsSinceYearTransformer(year=1968).
mapper = DataFrameMapper([(["launch", "return"], [
    DateTimeDomain(),
    SecondsSinceYearTransformer(year=1968),
    ExpressionTransformer("X[1] - X[0]")
])])

build_apollo(mapper, "DurationInSecondsApollo")
Exemple #20
0
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)


audit_data = load_audit("Audit")

# Audit features: label-binarized categorical columns, declared continuous
# columns, and a derived "Hourly_Income" feature computed as
# Income / (Hours * 52.0). The mapper is created with df_out=True.
audit_feature_pipeline = Pipeline([
    ("mapper",
     DataFrameMapper(
         [(cat_column, [CategoricalDomain(),
                        LabelBinarizer()]) for cat_column in
          ["Employment", "Education", "Marital", "Occupation", "Gender"]] +
         [(cont_column, ContinuousDomain())
          for cont_column in ["Age", "Income", "Hours"]] +
         [(["Income", "Hours"],
           Alias(ExpressionTransformer("X[0] / (X[1] * 52.0)"),
                 "Hourly_Income",
                 prefit=True))],
         df_out=True))
])

# NOTE(review): the meaning of the numeric arguments (3, 7) is defined by
# build_classifier, which is outside this excerpt - confirm there.
build_classifier(audit_data, audit_feature_pipeline, 3, 7, "TPOTAudit")

iris_data = load_iris("Iris")

# Iris features: all columns of iris_data[0] are declared as one continuous domain.
iris_feature_pipeline = Pipeline([
    ("mapper",
     DataFrameMapper([(iris_data[0].columns.values, ContinuousDomain())]))
])

build_classifier(iris_data, iris_feature_pipeline, 7, 17, "TPOTIris")