def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
    cylinders_origin_mapping = {
        (8, 1) : "8/1",
        (6, 1) : "6/1",
        (4, 1) : "4/1",
        (6, 2) : "6/2",
        (4, 2) : "4/2",
        (4, 3) : "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
        (["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
        (["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
        (["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
        (["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
        (["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
    store_csv(mpg, name)
def build_auto(regressor, name, **kwargs):
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)], {"alias": "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
        (["origin"], OneHotEncoder()),
        (["weight", "displacement"], ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {"alias": "weight / displacement + 0.5"})
    ])
    pipeline = Pipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13), precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
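# A minimal usage sketch for the builder above. The concrete regressor and the output
# name are illustrative assumptions, not taken from the original test suite; the
# `auto_X`/`auto_y` frames are expected to be loaded by the surrounding script.
from sklearn.linear_model import LinearRegression

build_auto(LinearRegression(), "LinearRegressionAuto")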
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    mapper = DataFrameMapper(
        [([column], ContinuousDomain()) for column in ["Age", "Income"]] +
        [(["Hours"], [ContinuousDomain(), CutTransformer(bins=[0, 20, 40, 60, 80, 100], labels=False, right=False, include_lowest=True)])] +
        [(["Employment", "Education", "Marital", "Occupation"], [MultiDomain([CategoricalDomain(), CategoricalDomain(), CategoricalDomain(), CategoricalDomain()]), OrdinalEncoder()])] +
        [([column], [CategoricalDomain(), LabelEncoder()]) for column in ["Gender", "Deductions"]]
    )
    pipeline = Pipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name)
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
    mapper = DataFrameMapper(
        [([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
        [(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
        [(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
        [(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
        [(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        tree = regressor.tree_
        node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
        pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
    pipeline.configure(**pmml_options)
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    if isinstance(regressor, DecisionTreeRegressor):
        Xt = pipeline_transform(pipeline, auto_na_X)
        mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis = 1)
    store_csv(mpg, name)
def test_fit_string_missing(self):
    domain = CategoricalDomain(missing_values="N/A", missing_value_replacement="0")
    domain = clone(domain)
    self.assertEqual("N/A", domain.missing_values)
    self.assertEqual("0", domain.missing_value_replacement)
    self.assertFalse(domain._empty_fit())
    X = DataFrame(["1", "N/A", "3", "2", "N/A", "2"])
    Xt = domain.fit_transform(X)
    self.assertIsInstance(Xt, DataFrame)
    self.assertEqual(["1", "2", "3"], domain.data_.tolist())
    self.assertEqual({"totalFreq": 6, "missingFreq": 2, "invalidFreq": 0}, domain.counts_)
    self.assertEqual({"1": 1, "2": 2, "3": 1}, _value_count(domain.discr_stats_))
    # DataFrame.ix has been removed from pandas; use positional .iloc indexing instead
    self.assertEqual(["1", "0", "3", "2", "0", "2"], Xt.iloc[:, 0].tolist())
    X = numpy.array(["N/A", "N/A"])
    Xt = domain.transform(X)
    self.assertEqual(["0", "0"], Xt.tolist())
def get_training_data(con):
    data = pd.read_sql("""select user_responses.id as id,
            drink_name as drink,
            user_responses.question_name as question_name,
            question_choices.choice as choice,
            session_id
        from user_responses
        inner join question_choices on user_responses.question_choice = question_choices.id""",
        con=con, index_col='id')
    print(data)
    data = data.pivot(index='session_id', columns='question_name', values=['choice', 'drink'])
    print(data)
    pipeline = PMMLPipeline([
        ("transformation", DataFrameMapper([
            (["hotdog"], [CategoricalDomain(), LabelBinarizer()]),
            (["tp"], [CategoricalDomain(), LabelBinarizer()]),
            (["personality"], [CategoricalDomain(), LabelBinarizer()])
        ])),
        ("classifier", GaussianNB())
    ])
    return data, pipeline
def build_auto(regressor, name, **pmml_options):
    cylinders_origin_mapping = {
        (8, 1) : "8/1",
        (6, 1) : "6/1",
        (4, 1) : "4/1",
        (6, 2) : "6/2",
        (4, 2) : "4/2",
        (6, 3) : "6/3",
        (4, 3) : "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
        (["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
        (["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
    store_csv(mpg, name)
def build_audit_cat(classifier, name, with_proba = True, fit_params = {}):
    marital_mapping = {
        "Married-spouse-absent" : "Married"
    }
    mapper = DataFrameMapper(
        [([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
        [(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
        [(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
        [(["Marital"], [CategoricalDomain(display_name = "Marital"), FilterLookupTransformer(marital_mapping), OrdinalEncoder(dtype = numpy.uint16)])] +
        [(["Occupation"], [CategoricalDomain(display_name = "Occupation"), OrdinalEncoder(dtype = numpy.float_)])] +
        [([column], [CategoricalDomain(display_name = column), LabelEncoder()]) for column in ["Gender", "Deductions"]]
    )
    pipeline = Pipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
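# Hypothetical call to the categorical-feature builder above; the LGBMClassifier settings
# and the `categorical_feature` column indices are illustrative assumptions (the ordinal
# and label-encoded columns happen to occupy positions 3..8 in the mapped matrix here).
from lightgbm import LGBMClassifier

build_audit_cat(LGBMClassifier(objective = "binary", random_state = 13), "LGBMAuditCat",
    fit_params = {"classifier__categorical_feature" : [3, 4, 5, 6, 7, 8]})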
def test_fit_string(self):
    domain = clone(CategoricalDomain(with_data=False, with_statistics=False))
    self.assertTrue(domain._empty_fit())
    domain = clone(CategoricalDomain(missing_values=None, with_statistics=False))
    self.assertIsNone(domain.missing_values)
    self.assertEqual("as_is", domain.missing_value_treatment)
    self.assertIsNone(domain.missing_value_replacement)
    self.assertEqual("return_invalid", domain.invalid_value_treatment)
    self.assertIsNone(domain.invalid_value_replacement)
    self.assertFalse(domain._empty_fit())
    X = DataFrame(["1", None, "3", "2", None, "2"])
    Xt = domain.fit_transform(X)
    self.assertIsInstance(Xt, DataFrame)
    self.assertEqual(["1", "2", "3"], domain.data_.tolist())
    self.assertFalse(hasattr(domain, "counts_"))
    self.assertFalse(hasattr(domain, "discr_stats_"))
    self.assertEqual(["1", None, "3", "2", None, "2"], Xt.iloc[:, 0].tolist())
    X = numpy.array([None, None])
    Xt = domain.transform(X)
    self.assertEqual([None, None], Xt.tolist())
    X = numpy.array(["4"])
    with self.assertRaises(ValueError):
        domain.transform(X)
def test_fit_int(self):
    domain = clone(CategoricalDomain(with_data=False, with_statistics=False))
    self.assertTrue(domain._empty_fit())
    domain = clone(CategoricalDomain(missing_value_treatment="as_value", missing_value_replacement=1, invalid_value_treatment="as_is", invalid_value_replacement=0))
    self.assertIsNone(domain.missing_values)
    self.assertEqual("as_value", domain.missing_value_treatment)
    self.assertEqual(1, domain.missing_value_replacement)
    self.assertEqual("as_is", domain.invalid_value_treatment)
    self.assertEqual(0, domain.invalid_value_replacement)
    self.assertFalse(hasattr(domain, "data_"))
    self.assertFalse(hasattr(domain, "counts_"))
    self.assertFalse(hasattr(domain, "discr_stats_"))
    self.assertFalse(domain._empty_fit())
    X = DataFrame([1, None, 3, 2, None, 2])
    Xt = domain.fit_transform(X)
    self.assertIsInstance(Xt, DataFrame)
    self.assertEqual([1, 2, 3], domain.data_.tolist())
    self.assertEqual({"totalFreq": 6, "missingFreq": 2, "invalidFreq": 0}, domain.counts_)
    self.assertEqual({1: 1, 2: 2, 3: 1}, _value_count(domain.discr_stats_))
    self.assertEqual([1, 1, 3, 2, 1, 2], Xt[0].tolist())
    X = numpy.array([None, None])
    Xt = domain.transform(X)
    self.assertEqual([1, 1], Xt.tolist())
def test_fit_string(self):
    domain = CategoricalDomain(with_statistics = False)
    self.assertEqual("as_is", domain.missing_value_treatment)
    self.assertFalse(hasattr(domain, "missing_value_replacement"))
    self.assertEqual("return_invalid", domain.invalid_value_treatment)
    X = numpy.array(["1", None, "3", "2", None, "2"])
    Xt = domain.fit_transform(X)
    self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
    self.assertFalse(hasattr(domain, "counts_"))
    self.assertFalse(hasattr(domain, "discr_stats_"))
    self.assertEqual(numpy.array(["1", None, "3", "2", None, "2"]).tolist(), Xt.tolist())
def get_sample_data(con):
    data = pd.read_sql("select * from sample_training_data", con=con, index_col="id")
    pipeline = PMMLPipeline([
        ("transformation", DataFrameMapper([
            (["hotdog"], [CategoricalDomain(), LabelBinarizer()]),
            (["tp"], [CategoricalDomain(), LabelBinarizer()])
        ])),
        ("classifier", GaussianNB())
    ])
    return data, pipeline
def test_fit_int_missing(self):
    domain = CategoricalDomain(missing_values = -1, missing_value_replacement = 0)
    self.assertEqual(-1, domain.missing_values)
    self.assertEqual(0, domain.missing_value_replacement)
    self.assertFalse(domain._empty_fit())
    X = DataFrame([1, -1, 3, 2, -1, 2])
    Xt = domain.fit_transform(X)
    self.assertIsInstance(Xt, DataFrame)
    self.assertEqual([1, 2, 3], domain.data_.tolist())
    self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
    self.assertEqual({1 : 1, 2 : 2, 3 : 1}, _value_count(domain.discr_stats_))
    self.assertEqual([1, 0, 3, 2, 0, 2], Xt[0].tolist())
    X = numpy.array([-1, -1])
    Xt = domain.transform(X)
    self.assertEqual([0, 0], Xt.tolist())
def build_auto(regressor, name):
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), Imputer(missing_values = "NaN"), StandardScaler()]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
        (["origin"], OneHotEncoder())
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
    store_csv(mpg, name + ".csv")
def build_visit(regressor, name):
    mapper = DataFrameMapper(
        [(["edlevel"], [CategoricalDomain(), OneHotEncoder()])] +
        [([bin_column], [CategoricalDomain(), OneHotEncoder()]) for bin_column in ["outwork", "female", "married", "kids", "self"]] +
        [(["age"], ContinuousDomain())] +
        [(["hhninc", "educ"], ContinuousDomain())]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(visit_X, visit_y)
    pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
    store_csv(docvis, name)
def build_auto_h2o(regressor, name):
    transformer = ColumnTransformer(
        [(column, CategoricalDomain(), [column]) for column in ["cylinders", "model_year", "origin"]] +
        [(column, ContinuousDomain(), [column]) for column in ["displacement", "horsepower", "weight", "acceleration"]]
    )
    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", H2OFrameCreator(column_names=["cylinders", "model_year", "origin", "displacement", "horsepower", "weight", "acceleration"], column_types=["enum", "enum", "enum", "numeric", "numeric", "numeric", "numeric"])),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
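# Sketch of how the H2O builder above might be driven; the estimator choice and its
# parameters are assumptions. An H2O cluster must be running before the pipeline is fitted.
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

h2o.init()
build_auto_h2o(H2ORandomForestEstimator(ntrees = 17, seed = 13), "H2ORandomForestAuto")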
def test_fit_int(self):
    domain = CategoricalDomain(missing_value_treatment="as_value", missing_value_replacement=-999, invalid_value_treatment="as_is")
    self.assertEqual("as_value", domain.missing_value_treatment)
    self.assertEqual(-999, domain.missing_value_replacement)
    self.assertEqual("as_is", domain.invalid_value_treatment)
    self.assertFalse(hasattr(domain, "data_"))
    X = DataFrame(numpy.array([1, None, 3, 2, None, 2]))
    Xt = domain.fit_transform(X)
    self.assertEqual(numpy.array([1, 2, 3]).tolist(), domain.data_.tolist())
    self.assertEqual(numpy.array([1, -999, 3, 2, -999, 2]).tolist(), Xt[0].tolist())
    X = numpy.array([None, None])
    Xt = domain.transform(X)
    self.assertEqual(numpy.array([-999, -999]).tolist(), Xt.tolist())
def test_mapper(self): domain = CategoricalDomain() df = DataFrame([{"X": "2", "y": 2}, {"X": "1"}, {"X": "3"}]) mapper = DataFrameMapper([("X", [domain, LabelBinarizer()]), ("y", None)]) mapper.fit_transform(df) self.assertEqual( numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
def test_fit_int(self):
    domain = CategoricalDomain(missing_value_treatment = "as_value", missing_value_replacement = -999, invalid_value_treatment = "as_is")
    self.assertEqual("as_value", domain.missing_value_treatment)
    self.assertEqual(-999, domain.missing_value_replacement)
    self.assertEqual("as_is", domain.invalid_value_treatment)
    self.assertFalse(hasattr(domain, "data_"))
    self.assertFalse(hasattr(domain, "counts_"))
    self.assertFalse(hasattr(domain, "discr_stats_"))
    X = DataFrame(numpy.array([1, None, 3, 2, None, 2]))
    Xt = domain.fit_transform(X)
    self.assertEqual(numpy.array([1, 2, 3]).tolist(), domain.data_.tolist())
    self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
    self.assertEqual({1 : 1, 2 : 2, 3 : 1}, _value_count(domain.discr_stats_))
    self.assertEqual(numpy.array([1, -999, 3, 2, -999, 2]).tolist(), Xt[0].tolist())
    X = numpy.array([None, None])
    Xt = domain.transform(X)
    self.assertEqual(numpy.array([-999, -999]).tolist(), Xt.tolist())
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
    employment_mapping = {
        "CONSULTANT" : "PRIVATE",
        "PSFEDERAL" : "PUBLIC",
        "PSLOCAL" : "PUBLIC",
        "PSSTATE" : "PUBLIC",
        "SELFEMP" : "PRIVATE",
        "PRIVATE" : "PRIVATE"
    }
    gender_mapping = {
        "FEMALE" : 0.0,
        "MALE" : 1.0,
        "MISSING_VALUE" : 0.5
    }
    mapper = DataFrameMapper(
        [(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
        [(["Age"], MissingIndicator())] +
        [(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
        [(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
        [(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y, **fit_params)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
    store_csv(adjusted, name)
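# Possible invocation of the missing-value-aware builder above; the XGBClassifier
# configuration and the output name are illustrative assumptions only.
from xgboost import XGBClassifier

build_audit_na(XGBClassifier(objective = "binary:logistic", n_estimators = 71, random_state = 13), "XGBAuditNA")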
def test_mapper(self): domain = CategoricalDomain() df = DataFrame([{"X" : "2", "y" : 2}, {"X" : "1"}, {"X" : "3"}]) mapper = DataFrameMapper([ ("X", [domain, LabelBinarizer()]), ("y", None) ]) mapper.fit_transform(df) self.assertEqual({"totalFreq" : 3, "missingFreq" : 0, "invalidFreq" : 0}, domain.counts_) self.assertEqual({"1" : 1, "2" : 1, "3" : 1}, _value_count(domain.discr_stats_)) self.assertEqual(["1", "2", "3"], domain.data_.tolist())
def test_fit_transform(self):
    domain = MultiDomain([ContinuousDomain(missing_value_replacement = 0.0), CategoricalDomain(missing_value_replacement = "zero")])
    X = DataFrame([[-1.0, "minus one"], [float("NaN"), None], [1.0, "one"]], columns = ["x1", "x2"])
    Xt = domain.fit_transform(X)
    self.assertTrue(isinstance(Xt, DataFrame))
    self.assertEqual([-1.0, 0.0, 1.0], Xt["x1"].tolist())
    self.assertEqual(["minus one", "zero", "one"], Xt["x2"].tolist())
    X = numpy.array([[float("NaN"), None]])
    Xt = domain.transform(X)
    self.assertTrue(isinstance(Xt, numpy.ndarray))
    # assertEqual (not assertTrue) so that the replacement values are actually compared
    self.assertEqual([0.0], Xt[:, 0].tolist())
    self.assertEqual(["zero"], Xt[:, 1].tolist())
def make_fit_lgbmlr(gbdt, lr):
    mapper = DataFrameMapper(
        [([cat_column], [CategoricalDomain(), LabelEncoder()]) for cat_column in cat_columns] +
        [(cont_columns, ContinuousDomain())]
    )
    classifier = GBDTLRClassifier(gbdt, lr)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(df[cat_columns + cont_columns], df[label_column], classifier__gbdt__categorical_feature = range(0, len(cat_columns)))
    return pipeline
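# Illustrative driver for the GBDT+LR helper above; the estimator parameters and the
# output file name are assumptions. `cat_columns`, `cont_columns`, `label_column` and
# `df` are expected to be defined by the surrounding script.
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn2pmml import sklearn2pmml

pipeline = make_fit_lgbmlr(LGBMClassifier(n_estimators = 71, random_state = 13), LogisticRegression())
sklearn2pmml(pipeline, "GBDTLRAudit.pmml")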
def build_audit(classifier, name, with_proba = True, **kwargs):
    continuous_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Income", ContinuousDomain()),
        ("Hours", ContinuousDomain())
    ])
    categorical_mapper = DataFrameMapper([
        ("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
        ("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
        ("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
        ("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
        ("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name + ".csv")
def build_audit(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, **pmml_options):
    continuous_mapper = DataFrameMapper([
        (["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
        (["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
        (["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
        (["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
        (["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
        (["Deductions"], [CategoricalDomain()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
def build_audit(classifier, name, with_proba=True, **kwargs):
    continuous_mapper = DataFrameMapper([
        (["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [CategoricalDomain(), LabelBinarizer(), SelectFromModel(DecisionTreeClassifier(random_state=13))]),
        (["Education"], [CategoricalDomain(), LabelBinarizer(), SelectFromModel(RandomForestClassifier(random_state=13, n_estimators=3), threshold="1.25 * mean")]),
        (["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label=-1, pos_label=1), SelectKBest(k=3)]),
        (["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k=3)]),
        (["Gender"], [CategoricalDomain(), LabelBinarizer(neg_label=-3, pos_label=3)]),
        (["Deductions"], [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13), precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_auto_na(regressor, name):
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]] +
        [([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    store_csv(mpg, name + ".csv")
def build_audit_na(classifier, name, with_proba=True):
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values=None), Imputer()]) for column in ["Age", "Income", "Hours"]] +
        [("Employment", [CategoricalDomain(missing_values=None), CategoricalImputer(), LookupTransformer(employment_mapping, "Other"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values=None), CategoricalImputer(), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [("Gender", [CategoricalDomain(missing_values=None), CategoricalImputer(), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_auto_na_hist(regressor, name):
    mapper = DataFrameMapper(
        [([column], ContinuousDomain()) for column in ["displacement", "horsepower", "weight", "acceleration"]] +
        [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    store_csv(mpg, name)
def sklearn_audit(classifier, name):
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(
            [([column], [CategoricalDomain(), OneHotEncoder()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]] +
            [([column], ContinuousDomain()) for column in ["Age", "Income", "Hours"]]
        )),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/" + name + ".pmml", with_repr=False)
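# Hypothetical invocation of the exporter above; the classifier and its settings are
# assumptions, and the "pmml/" output directory must already exist.
from sklearn.ensemble import RandomForestClassifier

sklearn_audit(RandomForestClassifier(n_estimators = 10, random_state = 13), "RandomForestAudit")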
def test_fit_int(self):
    domain = CategoricalDomain()
    self.assertEqual("return_invalid", domain.invalid_value_treatment)
    self.assertFalse(hasattr(domain, "data_"))
    domain = domain.fit(numpy.array([1, 3, 2, 2]))
    self.assertEqual(numpy.array([1, 2, 3]).tolist(), domain.data_.tolist())
def test_fit_string(self):
    domain = CategoricalDomain()
    domain = domain.fit(numpy.array(["1", None, "3", "2", None, "2"]))
    self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())