Example #1
def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
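The helper above comes from the sklearn2pmml integration-test scripts and relies on module-level objects (auto_X, auto_y, store_pkl, store_csv) that are not shown here. Below is a minimal, self-contained sketch of the same decorate-map-fit-export pattern; the toy columns, values, and the "auto.pmml" output path are invented for illustration, and the final export step requires the Java-based JPMML-SkLearn converter that ships with the sklearn2pmml package.

import pandas
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn_pandas import DataFrameMapper
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

# Hypothetical stand-in for the "auto" dataset used by the helpers in this section
X = pandas.DataFrame({
	"origin" : [1, 2, 3, 1, 2],
	"weight" : [2130.0, 3150.0, 2625.0, 4100.0, 2295.0]
})
y = pandas.Series([30.0, 21.5, 27.0, 14.0, 31.9], name = "mpg")

mapper = DataFrameMapper([
	# CategoricalDomain/ContinuousDomain record the valid value space and the
	# missing/invalid value policy of each column in the resulting PMML document
	(["origin"], [CategoricalDomain(), LabelBinarizer()]),
	(["weight"], [ContinuousDomain(), StandardScaler()])
])
pipeline = PMMLPipeline([
	("mapper", mapper),
	("regressor", LinearRegression())
])
pipeline.fit(X, y)
sklearn2pmml(pipeline, "auto.pmml")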
Example #2
def build_auto(regressor, name, **kwargs):
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight",
          "acceleration"], [ContinuousDomain(),
                            StandardScaler()]),
        (["model_year"], [CategoricalDomain(),
                          Binarizer(threshold=77)], {
                              "alias": "bin(model_year, 77)"
                          }),  # Pre/post 1973 oil crisis effects
        (["origin"], OneHotEncoder()),
        (["weight", "displacement"],
         ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {
             "alias": "weight / displacement + 0.5"
         })
    ])
    pipeline = Pipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
Example #3
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    mapper = DataFrameMapper(
        [([column], ContinuousDomain())
         for column in ["Age", "Income"]] + [(["Hours"], [
             ContinuousDomain(),
             CutTransformer(bins=[0, 20, 40, 60, 80, 100],
                            labels=False,
                            right=False,
                            include_lowest=True)
         ])] + [(["Employment", "Education", "Marital", "Occupation"], [
             MultiDomain([
                 CategoricalDomain(),
                 CategoricalDomain(),
                 CategoricalDomain(),
                 CategoricalDomain()
             ]),
             OrdinalEncoder()
         ])] +
        [([column], [CategoricalDomain(), LabelEncoder()])
         for column in ["Gender", "Deductions"]])
    pipeline = Pipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name)
Example #4
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	mapper = DataFrameMapper(
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
		[(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
		[(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
		[(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
		[(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
		[(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)
	if isinstance(regressor, DecisionTreeRegressor):
		tree = regressor.tree_
		node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if isinstance(regressor, DecisionTreeRegressor):
		Xt = pipeline_transform(pipeline, auto_na_X)
		mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, mpg_apply), axis = 1)
	store_csv(mpg, name)
Example #5
 def test_fit_string_missing(self):
     domain = CategoricalDomain(missing_values="N/A",
                                missing_value_replacement="0")
     domain = clone(domain)
     self.assertEqual("N/A", domain.missing_values)
     self.assertEqual("0", domain.missing_value_replacement)
     self.assertFalse(domain._empty_fit())
     X = DataFrame(["1", "N/A", "3", "2", "N/A", "2"])
     Xt = domain.fit_transform(X)
     self.assertIsInstance(Xt, DataFrame)
     self.assertEqual(["1", "2", "3"], domain.data_.tolist())
     self.assertEqual({
         "totalFreq": 6,
         "missingFreq": 2,
         "invalidFreq": 0
     }, domain.counts_)
     self.assertEqual({
         "1": 1,
         "2": 2,
         "3": 1
     }, _value_count(domain.discr_stats_))
     self.assertEqual(["1", "0", "3", "2", "0", "2"], Xt.ix[:, 0].tolist())
     X = numpy.array(["N/A", "N/A"])
     Xt = domain.transform(X)
     self.assertEqual(["0", "0"], Xt.tolist())
Example #6
def get_training_data(con):
    data = pd.read_sql("""select 
                        user_responses.id as id, 
                        drink_name as drink,  
                        user_responses.question_name as question_name, 
                        question_choices.choice as choice,
                        session_id
                        from user_responses inner join question_choices
                        on user_responses.question_choice = question_choices.id"""
                       , con=con
                       , index_col='id')
    
    print(data)
    
    data = data.pivot(index='session_id', columns='question_name', values=['choice', 'drink'])
    
    print(data)
    
    pipeline = PMMLPipeline([
            ("transformation", DataFrameMapper([
                (["hotdog"], [CategoricalDomain(), LabelBinarizer()])
                , (["tp"], [CategoricalDomain(), LabelBinarizer()])
                , (["personality"], [CategoricalDomain(), LabelBinarizer()])
            ])),
            ("classifier", GaussianNB())
        ])
    return data, pipeline
Example #7
def build_auto(regressor, name, **pmml_options):
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
Example #8
def build_audit_cat(classifier, name, with_proba = True, fit_params = {}):
	marital_mapping = {
		"Married-spouse-absent" : "Married"
	}
	mapper = DataFrameMapper(
		[([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
		[(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
		[(["Marital"], [CategoricalDomain(display_name = "Marital"), FilterLookupTransformer(marital_mapping), OrdinalEncoder(dtype = numpy.uint16)])] +
		[(["Occupation"], [CategoricalDomain(display_name = "Occupation"), OrdinalEncoder(dtype = numpy.float_)])] +
		[([column], [CategoricalDomain(display_name = column), LabelEncoder()]) for column in ["Gender", "Deductions"]]
	)
	pipeline = Pipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #9
 def test_fit_string(self):
     domain = clone(
         CategoricalDomain(with_data=False, with_statistics=False))
     self.assertTrue(domain._empty_fit())
     domain = clone(
         CategoricalDomain(missing_values=None, with_statistics=False))
     self.assertIsNone(domain.missing_values)
     self.assertEqual("as_is", domain.missing_value_treatment)
     self.assertIsNone(domain.missing_value_replacement)
     self.assertEqual("return_invalid", domain.invalid_value_treatment)
     self.assertIsNone(domain.invalid_value_replacement)
     self.assertFalse(domain._empty_fit())
     X = DataFrame(["1", None, "3", "2", None, "2"])
     Xt = domain.fit_transform(X)
     self.assertIsInstance(Xt, DataFrame)
     self.assertEqual(["1", "2", "3"], domain.data_.tolist())
     self.assertFalse(hasattr(domain, "counts_"))
     self.assertFalse(hasattr(domain, "discr_stats_"))
     self.assertEqual(["1", None, "3", "2", None, "2"], Xt.iloc[:,
                                                                0].tolist())
     X = numpy.array([None, None])
     Xt = domain.transform(X)
     self.assertEqual([None, None], Xt.tolist())
     X = numpy.array(["4"])
     with self.assertRaises(ValueError):
         domain.transform(X)
Example #10
 def test_fit_int(self):
     domain = clone(
         CategoricalDomain(with_data=False, with_statistics=False))
     self.assertTrue(domain._empty_fit())
     domain = clone(
         CategoricalDomain(missing_value_treatment="as_value",
                           missing_value_replacement=1,
                           invalid_value_treatment="as_is",
                           invalid_value_replacement=0))
     self.assertIsNone(domain.missing_values)
     self.assertEqual("as_value", domain.missing_value_treatment)
     self.assertEqual(1, domain.missing_value_replacement)
     self.assertEqual("as_is", domain.invalid_value_treatment)
     self.assertEqual(0, domain.invalid_value_replacement)
     self.assertFalse(hasattr(domain, "data_"))
     self.assertFalse(hasattr(domain, "counts_"))
     self.assertFalse(hasattr(domain, "discr_stats_"))
     self.assertFalse(domain._empty_fit())
     X = DataFrame([1, None, 3, 2, None, 2])
     Xt = domain.fit_transform(X)
     self.assertIsInstance(Xt, DataFrame)
     self.assertEqual([1, 2, 3], domain.data_.tolist())
     self.assertEqual({
         "totalFreq": 6,
         "missingFreq": 2,
         "invalidFreq": 0
     }, domain.counts_)
     self.assertEqual({1: 1, 2: 2, 3: 1}, _value_count(domain.discr_stats_))
     self.assertEqual([1, 1, 3, 2, 1, 2], Xt[0].tolist())
     X = numpy.array([None, None])
     Xt = domain.transform(X)
     self.assertEqual([1, 1], Xt.tolist())
Example #11
	def test_fit_string(self):
		domain = CategoricalDomain(with_statistics = False)
		self.assertEqual("as_is", domain.missing_value_treatment)
		self.assertFalse(hasattr(domain, "missing_value_replacement"))
		self.assertEqual("return_invalid", domain.invalid_value_treatment)
		X = numpy.array(["1", None, "3", "2", None, "2"])
		Xt = domain.fit_transform(X)
		self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
		self.assertFalse(hasattr(domain, "counts_"))
		self.assertFalse(hasattr(domain, "discr_stats_"))
		self.assertEqual(numpy.array(["1", None, "3", "2", None, "2"]).tolist(), Xt.tolist())
Example #12
def get_sample_data(con):
    data = pd.read_sql("select * from sample_training_data"
                       , con=con
                       , index_col="id")
    
    pipeline = PMMLPipeline([
            ("transformation", DataFrameMapper([
                (["hotdog"], [CategoricalDomain(), LabelBinarizer()]),
                (["tp"], [CategoricalDomain(), LabelBinarizer()])
            ])),
            ("classifier", GaussianNB())
        ])
    return data, pipeline
Example #13
	def test_fit_int_missing(self):
		domain = CategoricalDomain(missing_values = -1, missing_value_replacement = 0)
		self.assertEqual(-1, domain.missing_values)
		self.assertEqual(0, domain.missing_value_replacement)
		self.assertFalse(domain._empty_fit())
		X = DataFrame([1, -1, 3, 2, -1, 2])
		Xt = domain.fit_transform(X)
		self.assertIsInstance(Xt, DataFrame)
		self.assertEqual([1, 2, 3], domain.data_.tolist())
		self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
		self.assertEqual({1 : 1, 2 : 2, 3 : 1}, _value_count(domain.discr_stats_))
		self.assertEqual([1, 0, 3, 2, 0, 2], Xt[0].tolist())
		X = numpy.array([-1, -1])
		Xt = domain.transform(X)
		self.assertEqual([0, 0], Xt.tolist())
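The same behaviour is easy to reproduce outside the unittest harness; a quick sketch, assuming only numpy and sklearn2pmml are installed (the values mirror the test above):

import numpy
from sklearn2pmml.decoration import CategoricalDomain

domain = CategoricalDomain(missing_values = -1, missing_value_replacement = 0)
Xt = domain.fit_transform(numpy.array([1, -1, 3, 2, -1, 2]))
print(domain.data_.tolist())  # valid categories learned during fit: [1, 2, 3]
print(Xt.tolist())            # declared missing values (-1) replaced by 0: [1, 0, 3, 2, 0, 2]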
Example #14
def build_auto(regressor, name):
	mapper = DataFrameMapper([
		(["cylinders"], CategoricalDomain()),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), Imputer(missing_values = "NaN"), StandardScaler()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["origin"], OneHotEncoder())
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
Example #15
def build_visit(regressor, name):
	mapper = DataFrameMapper(
		[(["edlevel"], [CategoricalDomain(), OneHotEncoder()])] +
		[([bin_column], [CategoricalDomain(), OneHotEncoder()]) for bin_column in ["outwork", "female", "married", "kids", "self"]] +
		[(["age"], ContinuousDomain())] +
		[(["hhninc", "educ"], ContinuousDomain())]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(visit_X, visit_y)
	pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
	store_csv(docvis, name)
Example #16
def build_auto_h2o(regressor, name):
    transformer = ColumnTransformer(
        [(column, CategoricalDomain(), [column])
         for column in ["cylinders", "model_year", "origin"]] +
        [(column, ContinuousDomain(), [column]) for column in
         ["displacement", "horsepower", "weight", "acceleration"]])
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("uploader",
                              H2OFrameCreator(column_names=[
                                  "cylinders", "model_year", "origin",
                                  "displacement", "horsepower", "weight",
                                  "acceleration"
                              ],
                                              column_types=[
                                                  "enum", "enum", "enum",
                                                  "numeric", "numeric",
                                                  "numeric", "numeric"
                                              ])), ("regressor", regressor)])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
Example #17
 def test_fit_int(self):
     domain = CategoricalDomain(missing_value_treatment="as_value",
                                missing_value_replacement=-999,
                                invalid_value_treatment="as_is")
     self.assertEqual("as_value", domain.missing_value_treatment)
     self.assertEqual(-999, domain.missing_value_replacement)
     self.assertEqual("as_is", domain.invalid_value_treatment)
     self.assertFalse(hasattr(domain, "data_"))
     X = DataFrame(numpy.array([1, None, 3, 2, None, 2]))
     Xt = domain.fit_transform(X)
     self.assertEqual(
         numpy.array([1, 2, 3]).tolist(), domain.data_.tolist())
     self.assertEqual(
         numpy.array([1, -999, 3, 2, -999, 2]).tolist(), Xt[0].tolist())
     X = numpy.array([None, None])
     Xt = domain.transform(X)
     self.assertEqual(numpy.array([-999, -999]).tolist(), Xt.tolist())
Example #18
 def test_mapper(self):
     domain = CategoricalDomain()
     df = DataFrame([{"X": "2", "y": 2}, {"X": "1"}, {"X": "3"}])
     mapper = DataFrameMapper([("X", [domain, LabelBinarizer()]),
                               ("y", None)])
     mapper.fit_transform(df)
     self.assertEqual(
         numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
Example #19
	def test_fit_int(self):
		domain = CategoricalDomain(missing_value_treatment = "as_value", missing_value_replacement = -999, invalid_value_treatment = "as_is")
		self.assertEqual("as_value", domain.missing_value_treatment)
		self.assertEqual(-999, domain.missing_value_replacement)
		self.assertEqual("as_is", domain.invalid_value_treatment)
		self.assertFalse(hasattr(domain, "data_"))
		self.assertFalse(hasattr(domain, "counts_"))
		self.assertFalse(hasattr(domain, "discr_stats_"))
		X = DataFrame(numpy.array([1, None, 3, 2, None, 2]))
		Xt = domain.fit_transform(X)
		self.assertEqual(numpy.array([1, 2, 3]).tolist(), domain.data_.tolist())
		self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
		self.assertEqual({1 : 1, 2 : 2, 3 : 1}, _value_count(domain.discr_stats_))
		self.assertEqual(numpy.array([1, -999, 3, 2, -999, 2]).tolist(), Xt[0].tolist())
		X = numpy.array([None, None])
		Xt = domain.transform(X)
		self.assertEqual(numpy.array([-999, -999]).tolist(), Xt.tolist())
Example #20
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Example #21
	def test_mapper(self):
		domain = CategoricalDomain()
		df = DataFrame([{"X" : "2", "y" : 2}, {"X" : "1"}, {"X" : "3"}])
		mapper = DataFrameMapper([
			("X", [domain, LabelBinarizer()]),
			("y", None)
		])
		mapper.fit_transform(df)
		self.assertEqual({"totalFreq" : 3, "missingFreq" : 0, "invalidFreq" : 0}, domain.counts_)
		self.assertEqual({"1" : 1, "2" : 1, "3" : 1}, _value_count(domain.discr_stats_))
		self.assertEqual(["1", "2", "3"], domain.data_.tolist())
Example #22
	def test_fit_transform(self):
		domain = MultiDomain([ContinuousDomain(missing_value_replacement = 0.0), CategoricalDomain(missing_value_replacement = "zero")])
		X = DataFrame([[-1.0, "minus one"], [float("NaN"), None], [1.0, "one"]], columns = ["x1", "x2"])
		Xt = domain.fit_transform(X)
		self.assertTrue(isinstance(Xt, DataFrame))
		self.assertEqual([-1.0, 0.0, 1.0], Xt["x1"].tolist())
		self.assertEqual(["minus one", "zero", "one"], Xt["x2"].tolist())
		X = numpy.array([[float("NaN"), None]])
		Xt = domain.transform(X)
		self.assertTrue(isinstance(Xt, numpy.ndarray))
		self.assertEqual([0.0], Xt[:, 0].tolist())
		self.assertEqual(["zero"], Xt[:, 1].tolist())
Example #23
def make_fit_lgbmlr(gbdt, lr):
    mapper = DataFrameMapper(
        [([cat_column],
          [CategoricalDomain(), LabelEncoder()])
         for cat_column in cat_columns] + [(cont_columns, ContinuousDomain())])
    classifier = GBDTLRClassifier(gbdt, lr)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(df[cat_columns + cont_columns],
                 df[label_column],
                 classifier__gbdt__categorical_feature=range(
                     0, len(cat_columns)))
    return pipeline
Example #24
def build_audit(classifier, name, with_proba = True, **kwargs):
	continuous_mapper = DataFrameMapper([
		("Age", ContinuousDomain()),
		("Income", ContinuousDomain()),
		("Hours", ContinuousDomain())
	])
	categorical_mapper = DataFrameMapper([
		("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
		("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
		("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
		("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		("Deductions", [CategoricalDomain(), LabelEncoder()]),
	])
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	customize(classifier, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")
Example #25
def build_audit(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, **pmml_options):
	continuous_mapper = DataFrameMapper([
		(["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
	])
	categorical_mapper = DataFrameMapper([
		(["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
		(["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
		(["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		(["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
		(["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
		(["Deductions"], [CategoricalDomain()]),
	])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #26
def build_audit(classifier, name, with_proba=True, **kwargs):
    continuous_mapper = DataFrameMapper([
        (["Age", "Income",
          "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        (["Education"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13,
                                                   n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        (["Marital"], [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        (["Occupation"],
         [CategoricalDomain(),
          LabelBinarizer(),
          SelectKBest(k=3)]),
        (["Gender"],
         [CategoricalDomain(),
          LabelBinarizer(neg_label=-3, pos_label=3)]),
        (["Deductions"], [CategoricalDomain(),
                          LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union",
         FeatureUnion([("continuous", continuous_mapper),
                       ("categorical",
                        Pipeline([("mapper", categorical_mapper),
                                  ("polynomial", PolynomialFeatures())]))])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #27
def build_auto_na(regressor, name):
	mapper = DataFrameMapper(
		[([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]] +
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_na_X, auto_na_y)
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
Example #28
def build_audit_na(classifier, name, with_proba=True):
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values=None),
                     Imputer()])
         for column in ["Age", "Income", "Hours"]] + [("Employment", [
             CategoricalDomain(missing_values=None),
             CategoricalImputer(),
             LookupTransformer(employment_mapping, "Other"),
             PMMLLabelBinarizer()
         ])] + [([column], [
             CategoricalDomain(missing_values=None),
             CategoricalImputer(),
             PMMLLabelBinarizer()
         ]) for column in ["Education", "Marital", "Occupation"]] +
        [("Gender", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(gender_mapping, None)
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #29
def build_auto_na_hist(regressor, name):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["displacement", "horsepower", "weight", "acceleration"]] +
		[([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_na_X, auto_na_y)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	store_csv(mpg, name)
Example #30
def sklearn_audit(classifier, name):
    pipeline = PMMLPipeline([
        ("mapper",
         DataFrameMapper(
             [([column],
               [CategoricalDomain(), OneHotEncoder()]) for column in
              ["Employment", "Education", "Marital", "Occupation", "Gender"]] +
             [([column], ContinuousDomain())
              for column in ["Age", "Income", "Hours"]])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/" + name + ".pmml", with_repr=False)
Example #31
	def test_fit_int(self):
		domain = CategoricalDomain()
		self.assertEqual("return_invalid", domain.invalid_value_treatment)
		self.assertFalse(hasattr(domain, "data_"))
		domain = domain.fit(numpy.array([1, 3, 2, 2]))
		self.assertEqual(numpy.array([1, 2, 3]).tolist(), domain.data_.tolist())
Example #32
	def test_fit_string(self):
		domain = CategoricalDomain()
		domain = domain.fit(numpy.array(["1", None, "3", "2", None, "2"]))
		self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())