def build_auto(regressor, name, **pmml_options):
	"""Train an auto-mpg regression pipeline, verify it, and persist the model plus its predictions."""
	# Composite (cylinders, origin) categories; pairs absent from the map fall back to "other"
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	column_defs = [
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["cylinders"], Alias(ExpressionTransformer("X[0] % 2.0 > 0.0"), name = "odd(cylinders)", prefit = True)),
		# Pre/post 1973 oil crisis effects
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
		(["model_year", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), ConcatTransformer("/"), LabelBinarizer(), SelectorProxy(SelectFromModel(RandomForestRegressor(random_state = 13, n_estimators = 3), threshold = "1.25 * mean"))]),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(column_defs)),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	# XGBoost predictions are verified with an explicit numeric tolerance
	verify_sample = auto_X.sample(frac = 0.05, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(verify_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verify_sample)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Train, verify and store an auto-mpg regression pipeline.

	fit_params and predict_params are optional keyword dicts forwarded to
	fit/predict/verify. Their defaults were mutable dicts ({}), which Python
	shares across calls; the None sentinel fixes that anti-pattern while
	keeping the behavior identical for all callers.
	pmml_options is forwarded to pipeline.configure().
	"""
	# Normalize the optional kwarg dicts; never share a mutable default
	fit_params = dict(fit_params) if fit_params is not None else {}
	predict_params = dict(predict_params) if predict_params is not None else {}
	# Composite (cylinders, origin) categories; pairs absent from the map fall back to "other"
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		# Re-encode model_year as an ISO date string, then binarize around 1977
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost predictions are verified with an explicit numeric tolerance
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
def test_sequence_transform(self):
	"""A chained ExpressionTransformer + Imputer replaces nulls via a 0 placeholder."""
	steps = [ExpressionTransformer("0 if pandas.isnull(X[0]) else X[0]"), Imputer(missing_values = 0)]
	mapper = DataFrameMapper([
		(["a"], steps)
	])
	frame = DataFrame([[None], [1], [None]], columns = ["a"])
	transformed = mapper.fit_transform(frame)
	self.assertEqual([[1], [1], [1]], transformed.tolist())
def test_timedelta_seconds(self):
	"""Second-resolution timestamp differences relative to a 2010 epoch."""
	frame = DataFrame([
		["2018-12-31T23:59:59", "2019-01-01T00:00:00"],
		["2019-01-01T03:30:03", "2019-01-01T00:00:00"]
	], columns = ["left", "right"])
	mapper = DataFrameMapper([
		(["left", "right"], [DateTimeDomain(), SecondsSinceYearTransformer(year = 2010), ExpressionTransformer("X[0] - X[1]")])
	])
	delta = mapper.fit_transform(frame)
	self.assertEqual([[-1], [12603]], delta.tolist())
def build_auto(regressor, name, **kwargs):
	"""Fit an auto-mpg pipeline, wrap it as a PMML pipeline, verify and store it."""
	column_defs = [
		(["cylinders"], CategoricalDomain()),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		# Pre/post 1973 oil crisis effects
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
		(["origin"], OneHotEncoder()),
		(["weight", "displacement"], ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	pipeline = Pipeline([
		("mapper", DataFrameMapper(column_defs)),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
	# XGBoost predictions are verified with an explicit numeric tolerance
	verify_sample = auto_X.sample(frac = 0.05, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(verify_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verify_sample)
	customize(regressor, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
def test_timedelta_days(self):
	"""Day-resolution date differences computed through a FeatureUnion of two mappers."""
	frame = DataFrame([
		["2018-12-31", "2019-01-01"],
		["2019-01-31", "2019-01-01"]
	], columns = ["left", "right"])
	def _day_mapper(column):
		# One mapper per column, both anchored to the same 2010 epoch
		return DataFrameMapper([
			(column, [DateDomain(), DaysSinceYearTransformer(year = 2010)])
		])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("left_mapper", _day_mapper("left")),
			("right_mapper", _day_mapper("right"))
		])),
		("expression", Alias(ExpressionTransformer("X[0] - X[1]"), "delta(left, right)", prefit = True))
	])
	delta = pipeline.fit_transform(frame)
	self.assertEqual([[-1], [30]], delta.tolist())
def build_auto(regressor, name, **pmml_options):
	"""Fit an auto-mpg pipeline, convert it to a PMML pipeline, verify and store it."""
	# Composite (cylinders, origin) categories; pairs absent from the map fall back to "other"
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	column_defs = [
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		# Pre/post 1973 oil crisis effects
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	pipeline = Pipeline([
		("mapper", DataFrameMapper(column_defs)),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
	pipeline.configure(**pmml_options)
	# XGBoost predictions are verified with an explicit numeric tolerance
	verify_sample = auto_X.sample(frac = 0.05, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(verify_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verify_sample)
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
def build_audit_na(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Train, verify and store an audit-NA classification pipeline.

	Parameters
	----------
	classifier: estimator placed at the end of the pipeline.
	name: base name passed to store_pkl/store_csv.
	with_proba: when True, class probabilities are appended to the output CSV.
	fit_params, predict_params, predict_proba_params: optional keyword dicts
		forwarded to fit/predict/predict_proba/verify. Their defaults were
		mutable dicts ({}), which Python shares across calls; the None
		sentinel fixes that anti-pattern with identical behavior.
	predict_transformer, predict_proba_transformer, apply_transformer:
		optional post-processing transformers attached to the PMMLPipeline.
	pmml_options: forwarded to pipeline.configure().
	"""
	# Normalize the optional kwarg dicts; never share a mutable default
	fit_params = dict(fit_params) if fit_params is not None else {}
	predict_params = dict(predict_params) if predict_params is not None else {}
	predict_proba_params = dict(predict_proba_params) if predict_proba_params is not None else {}
	# Collapse employment categories into a PRIVATE/PUBLIC scheme; everything else becomes "OTHER"
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	# Numeric gender encoding; the explicit MISSING_VALUE key maps to the midpoint 0.5
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		# Age: flag missing values as -999, then impute the constant 38
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		# Hours: same -999 flagging, mean-imputed with an indicator column
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		# Income: values outside [5000, 200000] are treated as missing, then median-imputed
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		# Education/Marital/Occupation: missing values become the "N/A" token, then most-frequent imputation
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost predictions are verified with an explicit numeric tolerance
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# Also export the decision-tree leaf identifiers
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
def build_audit_na(classifier, name, with_proba = True, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Train and store an audit-NA classification pipeline.

	Fits a PMMLPipeline (DataFrameMapper + classifier) on the module-level
	audit_na_X/audit_na_y data, applies pmml_options via pipeline.configure(),
	pickles the pipeline, and writes predictions (optionally with class
	probabilities and, for decision trees, leaf node ids) to CSV.
	"""
	# Collapse employment categories to PRIVATE/PUBLIC; unmapped values become "OTHER" below
	employment_mapping = { "CONSULTANT" : "PRIVATE", "PSFEDERAL" : "PUBLIC", "PSLOCAL" : "PUBLIC", "PSSTATE" : "PUBLIC", "SELFEMP" : "PRIVATE", "PRIVATE" : "PRIVATE" }
	# Binary gender encoding; unmapped values fall back to None (see LookupTransformer below)
	gender_mapping = { "FEMALE" : 0, "MALE" : 1 }
	mapper = DataFrameMapper(
		# Age: flag missing values as -999, then impute them
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name = "flag_missing(Age, -999)"), Imputer(missing_values = -999)])] +
		# Hours: same -999 flagging pattern, written with the inverse conditional
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), Imputer(missing_values = -999)])] +
		# Income: values outside [5000, 200000] are treated as missing, then imputed
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), Imputer()])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	if with_proba == True:
		# Append the two class probability columns to the prediction frame
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# Also export the decision-tree leaf identifiers
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name + ".csv")
def build_auto(regressor, name, **kwargs):
	"""Fit a PMML auto-mpg pipeline with explicit NaN imputation and store it."""
	column_defs = [
		(["cylinders"], CategoricalDomain()),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(missing_values = None), Imputer(missing_values = "NaN"), StandardScaler()]),
		# Pre/post 1973 oil crisis effects
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
		(["origin"], OneHotEncoder()),
		(["weight", "displacement"], ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(column_defs)),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	customize(regressor, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
adjusted_proba = DataFrame( pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis=1) if isinstance(classifier, DecisionTreeClassifier): Xt = pipeline_transform(pipeline, audit_na_X) adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"]) adjusted = pandas.concat((adjusted, adjusted_apply), axis=1) store_csv(adjusted, name + ".csv") if "Audit" in datasets: build_audit_na(DecisionTreeClassifier(random_state=13, min_samples_leaf=5), "DecisionTreeAuditNA", apply_transformer=Alias( ExpressionTransformer("X[:, 0] - 1"), "eval(nodeId)", prefit=True), winner_id=True, class_extensions={"event": { "0": False, "1": True }}) build_audit_na( LogisticRegression(solver="newton-cg", max_iter=500), "LogisticRegressionAuditNA", predict_proba_transformer=Alias( ExpressionTransformer("numpy.where(X[:, 1] > 0.75, 1, 0)"), name="eval(probability(1))", prefit=True))
pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5) else: pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params) store_pkl(pipeline, name) adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"]) if with_proba == True: adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1) if isinstance(classifier, DecisionTreeClassifier): Xt = pipeline_transform(pipeline, audit_na_X) adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"]) adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1) store_csv(adjusted, name) if "Audit" in datasets: build_audit_na(DecisionTreeClassifier(min_samples_leaf = 5, random_state = 13), "DecisionTreeAuditNA", apply_transformer = Alias(ExpressionTransformer("X[0] - 1"), "eval(nodeId)", prefit = True), winner_id = True, class_extensions = {"event" : {"0" : False, "1" : True}}) build_audit_na(LogisticRegression(multi_class = "ovr", solver = "newton-cg", max_iter = 500), "LogisticRegressionAuditNA", predict_proba_transformer = Alias(ExpressionTransformer("1 if X[1] > 0.75 else 0"), name = "eval(probability(1))", prefit = True)) build_audit_na(XGBClassifier(objective = "binary:logistic", random_state = 13), "XGBAuditNA", predict_params = {"ntree_limit" : 71}, predict_proba_params = {"ntree_limit" : 71}, predict_transformer = Alias(ExpressionTransformer("X[0]"), name = "eval(Adjusted)", prefit = True), ntree_limit = 71) def build_audit_na_hist(classifier, name): mapper = DataFrameMapper( [([column], ContinuousDomain()) for column in ["Age", "Hours", "Income"]] + [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in 
["Employment", "Education", "Marital", "Occupation", "Gender"]] ) pipeline = PMMLPipeline([ ("mapper", mapper), ("classifier", classifier) ]) pipeline.fit(audit_na_X, audit_na_y) pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13)) store_pkl(pipeline, name)
h2o.init() mapper = DataFrameMapper([("Education", CategoricalDomain()), ("Employment", CategoricalDomain()), ("Gender", CategoricalDomain()), ("Marital", CategoricalDomain()), ("Occupation", CategoricalDomain()), ("Age", [ ContinuousDomain(), CutTransformer(bins=[17, 28, 37, 47, 83], labels=["q1", "q2", "q3", "q4"]) ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))]) classifier = H2ORandomForestEstimator(ntrees=17) predict_proba_transformer = Pipeline([ ("expression", ExpressionTransformer("X[1]")), ("cut", Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0], labels=["no", "maybe", "yes"]), "Decision", prefit=True)) ]) pipeline = PMMLPipeline([("local_mapper", mapper), ("uploader", H2OFrameCreator()), ("remote_classifier", classifier)],
def test_transform(self):
	"""ExpressionTransformer evaluates row-wise expressions over frames and arrays."""
	frame = DataFrame([[0.5, 0.5], [1.0, 2.0]], columns = ["a", "b"])
	# Column-name expressions with an explicit output dtype
	for target_dtype, expected in [(int, [[1], [3]]), (float, [[1.0], [3.0]])]:
		transformer = ExpressionTransformer("X['a'] + X['b']", dtype = target_dtype)
		self.assertTrue(hasattr(transformer, "expr"))
		self.assertTrue(hasattr(transformer, "dtype"))
		result = transformer.fit_transform(frame)
		self.assertIsInstance(result, numpy.ndarray)
		self.assertEqual(target_dtype, result.dtype)
		self.assertEqual(expected, result.tolist())
	# Positional expressions without a dtype, applied to a numpy array
	array = numpy.array([[0.5, 0.5], [1.0, 2.0]])
	transformer = ExpressionTransformer("X[0] + X[1]")
	self.assertTrue(hasattr(transformer, "expr"))
	self.assertTrue(hasattr(transformer, "dtype"))
	self.assertIsNone(transformer.dtype)
	result = transformer.fit_transform(array)
	self.assertIsInstance(result, numpy.ndarray)
	self.assertEqual([[1], [3]], result.tolist())
	# The remaining arithmetic operators
	for expr, expected in [("X[0] - X[1]", [[0.0], [-1.0]]), ("X[0] * X[1]", [[0.25], [2.0]]), ("X[0] / X[1]", [[1.0], [0.5]])]:
		self.assertEqual(expected, ExpressionTransformer(expr).fit_transform(array).tolist())
def test_transform(self):
	"""ExpressionTransformer applies column-slice arithmetic to a numpy array."""
	array = numpy.array([[0.5, 0.5], [1.0, 2.0]])
	transformer = ExpressionTransformer("X[:, 0] + X[:, 1]")
	# The fitted transformer exposes a compiled expression attribute
	self.assertTrue(hasattr(transformer, "expr_"))
	self.assertEqual([1.0, 3.0], transformer.fit_transform(array).tolist())
	for expr, expected in [("X[:, 0] - X[:, 1]", [0.0, -1.0]), ("X[:, 0] * X[:, 1]", [0.25, 2.0]), ("X[:, 0] / X[:, 1]", [1.0, 0.5])]:
		result = ExpressionTransformer(expr).fit_transform(array)
		self.assertEqual(expected, result.tolist())
("Employment", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]), ("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectKBest(chi2, k=3)]), ("Age", [ ContinuousDomain(), CutTransformer(bins=[17, 28, 37, 47, 83], labels=["q1", "q2", "q3", "q4"]), LabelBinarizer() ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income")) ]) interaction_mapper = DataFrameMapper([ ("Gender", [CategoricalDomain(), LabelBinarizer()]), ("Marital", [CategoricalDomain(), LabelBinarizer()]) ]) classifier = XGBClassifier() pipeline = PMMLPipeline([ ("mapper", FeatureUnion([("scalar_mapper", scalar_mapper), ("interaction", Pipeline([("interaction_mapper", interaction_mapper), ("polynomial", PolynomialFeatures())]))])), ("classifier", classifier) ])
if with_proba == True: adjusted_proba = DataFrame( pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis=1) if isinstance(classifier, DecisionTreeClassifier): Xt = pipeline_transform(pipeline, audit_na_X) adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"]) adjusted = pandas.concat((adjusted, adjusted_apply), axis=1) store_csv(adjusted, name + ".csv") if "Audit" in datasets: build_audit_na(DecisionTreeClassifier(random_state=13, min_samples_leaf=5), "DecisionTreeAuditNA", apply_transformer=Alias(ExpressionTransformer("X[0] - 1"), "eval(nodeId)", prefit=True), winner_id=True, class_extensions={"event": { "0": False, "1": True }}) build_audit_na(LogisticRegression(solver="newton-cg", max_iter=500), "LogisticRegressionAuditNA", predict_proba_transformer=Alias( ExpressionTransformer("1 if X[1] > 0.75 else 0"), name="eval(probability(1))", prefit=True)) build_audit_na(OptimalXGBClassifier(objective="binary:logistic", ntree_limit=11,
pipeline.fit(audit_na_X, audit_na_y, **fit_params) pipeline.configure(**pmml_options) pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params) store_pkl(pipeline, name) adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"]) if with_proba == True: adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1) if isinstance(classifier, DecisionTreeClassifier): Xt = pipeline_transform(pipeline, audit_na_X) adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"]) adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1) store_csv(adjusted, name) if "Audit" in datasets: build_audit_na(DecisionTreeClassifier(min_samples_leaf = 5, random_state = 13), "DecisionTreeAuditNA", apply_transformer = Alias(ExpressionTransformer("X[0] - 1"), "eval(nodeId)", prefit = True), winner_id = True, class_extensions = {"event" : {"0" : False, "1" : True}}) build_audit_na(LogisticRegression(solver = "newton-cg", max_iter = 500), "LogisticRegressionAuditNA", predict_proba_transformer = Alias(ExpressionTransformer("1 if X[1] > 0.75 else 0"), name = "eval(probability(1))", prefit = True)) def build_audit_na_direct(classifier, name): mapper = DataFrameMapper([ (["Age", "Hours", "Income"], None), (["Employment", "Education", "Marital", "Occupation", "Gender"], OneHotEncoder()) ]) pipeline = PMMLPipeline([ ("mapper", mapper), ("classifier", classifier) ]) pipeline.fit(audit_na_X, audit_na_y) pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5) store_pkl(pipeline, name) adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
store_csv(df, "Apollo")

def build_apollo(mapper, name):
	"""Fit a decision-tree mission-success classifier and store the model plus its predictions."""
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", DecisionTreeClassifier())
	])
	pipeline.fit(df, df["success"])
	store_pkl(pipeline, name)
	predictions = DataFrame(pipeline.predict(df), columns = ["success"])
	probabilities = DataFrame(pipeline.predict_proba(df), columns = ["probability(false)", "probability(true)"])
	store_csv(pandas.concat((predictions, probabilities), axis = 1), name)

# Mission duration expressed in whole days since 1968
mapper = DataFrameMapper([
	(["launch", "return"], [DateTimeDomain(), DaysSinceYearTransformer(year = 1968), ExpressionTransformer("X[1] - X[0]")])
])
build_apollo(mapper, "DurationInDaysApollo")

# Mission duration expressed in seconds since 1968
mapper = DataFrameMapper([
	(["launch", "return"], [DateTimeDomain(), SecondsSinceYearTransformer(year = 1968), ExpressionTransformer("X[1] - X[0]")])
])
build_apollo(mapper, "DurationInSecondsApollo")
result = pandas.concat([result, probabilities], axis=1) store_csv(result, name) audit_data = load_audit("Audit") audit_feature_pipeline = Pipeline([ ("mapper", DataFrameMapper( [(cat_column, [CategoricalDomain(), LabelBinarizer()]) for cat_column in ["Employment", "Education", "Marital", "Occupation", "Gender"]] + [(cont_column, ContinuousDomain()) for cont_column in ["Age", "Income", "Hours"]] + [(["Income", "Hours"], Alias(ExpressionTransformer("X[0] / (X[1] * 52.0)"), "Hourly_Income", prefit=True))], df_out=True)) ]) build_classifier(audit_data, audit_feature_pipeline, 3, 7, "TPOTAudit") iris_data = load_iris("Iris") iris_feature_pipeline = Pipeline([ ("mapper", DataFrameMapper([(iris_data[0].columns.values, ContinuousDomain())])) ]) build_classifier(iris_data, iris_feature_pipeline, 7, 17, "TPOTIris")