Beispiel #1
0
	def test_transform_string(self):
		# A mapping that contains a None key must be rejected with ValueError.
		lookup = {None : "null", "one" : "ein", "two" : "zwei", "three" : "drei"}
		self.assertRaises(ValueError, LookupTransformer, lookup, None)
		# Without the None key the same mapping is accepted.
		del lookup[None]
		translator = LookupTransformer(lookup, None)
		# "zero" has no entry, so the default value (None) is expected for it.
		from_list = translator.transform(["zero", "one"])
		self.assertEqual([None, "ein"], from_list.tolist())
		words = Series(numpy.array(["one", "two", "three"]))
		from_series = translator.transform(words)
		self.assertEqual(["ein", "zwei", "drei"], from_series.tolist())
Beispiel #2
0
 def test_transform_string(self):
     # The second constructor argument (None) is the value expected for
     # inputs that have no entry in the mapping, such as "zero".
     translator = LookupTransformer(
         {"one": "ein", "two": "zwei", "three": "drei"}, None)
     from_list = translator.transform(["zero", "one"]).tolist()
     self.assertEqual([None, "ein"], from_list)
     # The same transformer must also accept a pandas Series.
     words = Series(numpy.array(["one", "two", "three"]))
     from_series = translator.transform(words).tolist()
     self.assertEqual(["ein", "zwei", "drei"], from_series)
Beispiel #3
0
	def test_transform_float(self):
		# Float keys are mapped to their cosine; NaN is passed as the default value.
		mapping = {
			0.0 : math.cos(0.0),
			45.0 : math.cos(45.0),
			90.0 : math.cos(90.0)
		}
		transformer = LookupTransformer(mapping, float("NaN"))
		# Column-shaped (2-D) numpy input.
		X = numpy.array([[0.0], [90.0]])
		self.assertEqual([[math.cos(0.0)], [math.cos(90.0)]], transformer.transform(X).tolist())
		# 180.0 is not a key of the mapping, so the NaN default is expected.
		X = numpy.array([180.0])
		Xt = transformer.transform(X)
		# Pull the single element out explicitly: implicitly converting an array
		# with ndim > 0 to a scalar (as math.isnan(array) does) is deprecated
		# since NumPy 1.25.
		self.assertTrue(math.isnan(numpy.asarray(Xt).item()))
		# pandas Series input.
		X = Series([0.0, 45.0, 90.0])
		self.assertEqual([[math.cos(0.0)], [math.cos(45.0)], [math.cos(90.0)]], transformer.transform(X).tolist())
Beispiel #4
0
def build_audit_na(classifier, name, with_proba=True):
    """Fit a PMML pipeline on the audit-NA data and store its artifacts.

    The pipeline consists of a ``DataFrameMapper`` feature-preparation step
    followed by ``classifier``. It is fitted on the module-level
    ``audit_na_X`` / ``audit_na_y`` data, handed to ``store_pkl`` under
    ``name + ".pkl"``, and its predictions are handed to ``store_csv`` under
    ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final "classifier" pipeline step.
        name: Base name to which ".pkl" and ".csv" are appended.
        with_proba: When true, the output of ``predict_proba`` is appended to
            the stored predictions as "probability(0)" and "probability(1)".
    """
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}

    # Continuous columns: declare the domain, then impute missing values.
    continuous_features = [
        ([column], [ContinuousDomain(missing_values=None), Imputer()])
        for column in ["Age", "Income", "Hours"]
    ]
    # Employment is collapsed through employment_mapping ("Other" is the
    # default for unmapped values) before being binarized.
    employment_feature = ("Employment", [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        LookupTransformer(employment_mapping, "Other"),
        PMMLLabelBinarizer()
    ])
    categorical_features = [
        ([column], [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            PMMLLabelBinarizer()
        ])
        for column in ["Education", "Marital", "Occupation"]
    ]
    # Gender is encoded numerically through gender_mapping.
    gender_feature = ("Gender", [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        LookupTransformer(gender_mapping, None)
    ])
    # The concatenation order fixes the order of the mapped features.
    mapper = DataFrameMapper(
        continuous_features + [employment_feature] + categorical_features +
        [gender_feature])

    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Beispiel #5
0
def build_audit_na(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit, verify and store a PMML pipeline for the audit-NA data.

	The pipeline consists of a ``DataFrameMapper`` feature-preparation step
	followed by ``classifier``. It is fitted on the module-level
	``audit_na_X`` / ``audit_na_y`` data, configured with ``pmml_options``,
	verified against a 5% sample of ``audit_na_X`` and handed to
	``store_pkl``. The predictions are handed to ``store_csv``.

	Args:
		classifier: Estimator used as the final "classifier" pipeline step.
		name: Name passed on to ``store_pkl`` and ``store_csv``.
		with_proba: When true, the output of ``predict_proba`` is appended
			as "probability(0)" and "probability(1)".
		fit_params: Extra keyword arguments for ``pipeline.fit``
			(``None`` means no extra arguments).
		predict_params: Extra keyword arguments for ``pipeline.predict``
			(``None`` means no extra arguments).
		predict_proba_params: Extra keyword arguments for
			``pipeline.predict_proba`` (``None`` means no extra arguments).
		predict_transformer: Passed through to ``PMMLPipeline``.
		predict_proba_transformer: Passed through to ``PMMLPipeline``.
		apply_transformer: Passed through to ``PMMLPipeline``.
		**pmml_options: Passed to ``pipeline.configure``.
	"""
	# Use fresh dicts per call instead of mutable default arguments, which
	# would be shared between all calls.
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	predict_proba_params = {} if predict_proba_params is None else predict_proba_params
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	# Age and Hours have their missing values replaced by the sentinel -999
	# first, so that the following SimpleImputer can be keyed on that sentinel.
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost classifiers are verified with explicit precision and
	# zero-threshold settings; all other classifiers use the verify() defaults.
	verify_options = {}
	if isinstance(classifier, XGBClassifier):
		verify_options = {"precision" : 1e-5, "zeroThreshold" : 1e-5}
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, **verify_options)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	# For decision trees, also record the output of classifier.apply() as "nodeId".
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Beispiel #6
0
def build_audit_na(classifier,
                   name,
                   with_proba=True,
                   predict_proba_transformer=None,
                   apply_transformer=None,
                   **pmml_options):
    """Fit a PMML pipeline on the audit-NA data and store its artifacts.

    The pipeline consists of a ``DataFrameMapper`` feature-preparation step
    followed by ``classifier``. It is fitted on the module-level
    ``audit_na_X`` / ``audit_na_y`` data, configured with ``pmml_options``
    and handed to ``store_pkl`` under ``name + ".pkl"``. The predictions are
    handed to ``store_csv`` under ``name + ".csv"``.

    Args:
        classifier: Estimator used as the final "classifier" pipeline step.
        name: Base name to which ".pkl" and ".csv" are appended.
        with_proba: When true, the output of ``predict_proba`` is appended
            as "probability(0)" and "probability(1)".
        predict_proba_transformer: Passed through to ``PMMLPipeline``.
        apply_transformer: Passed through to ``PMMLPipeline``.
        **pmml_options: Passed to ``pipeline.configure``.
    """
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}

    # Age and Hours have their missing values replaced by the sentinel -999
    # first, so that the following Imputer can be keyed on that sentinel.
    age_feature = (["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
              name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])
    hours_feature = (["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
              name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ])
    # Income values outside [5000, 200000] are treated as missing values
    # by the domain, and then imputed.
    income_feature = (["Income"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        Imputer()
    ])
    # Employment is upper-cased to match the keys of employment_mapping
    # ("OTHER" is the default for unmapped values), then lower-cased again.
    employment_feature = (["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])
    categorical_features = [
        ([column], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(missing_values=None),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ])
        for column in ["Education", "Marital", "Occupation"]
    ]
    gender_feature = (["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])
    # The concatenation order fixes the order of the mapped features.
    mapper = DataFrameMapper(
        [age_feature, hours_feature, income_feature, employment_feature] +
        categorical_features + [gender_feature])

    pipeline = PMMLPipeline(
        [("mapper", mapper), ("classifier", classifier)],
        predict_proba_transformer=predict_proba_transformer,
        apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")

    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    # For decision trees, also record the output of classifier.apply()
    # as "nodeId".
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")