Esempio n. 1
0
def build_auto(regressor, name, **pmml_options):
	"""Fit `regressor` on the Auto data inside a PMMLPipeline and store the results.

	Categorical columns are encoded with `label_encoder(name)` for an
	`LGBMRegressor` and with `label_binarizer(name)` for anything else.
	An `IsolationForest` is fitted without a target and produces
	"decisionFunction"/"outlier" columns; every other estimator produces a
	single "mpg" column.

	Args:
		regressor: estimator placed at the end of the pipeline.
		name: identifier forwarded to the domain/encoder factories and to
			`store_pmml`/`store_csv`.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	is_lgbm = isinstance(regressor, LGBMRegressor)
	is_isolation_forest = isinstance(regressor, IsolationForest)
	make_encoder = label_encoder if is_lgbm else label_binarizer
	features = []
	for column in ["cylinders", "model_year", "origin"]:
		features.append(([column], [cat_domain(name), make_encoder(name)]))
	for column in ["displacement", "horsepower", "weight", "acceleration"]:
		features.append(([column], [cont_domain(name)]))
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	])
	if is_lgbm:
		# Presumably indices 0-2 are the three categorical columns, which are mapped first -- TODO confirm.
		pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
	elif is_isolation_forest:
		# Fitted on the features alone; no target is passed.
		pipeline.fit(auto_X)
	else:
		pipeline.fit(auto_X, auto_y)
	verification_data = auto_X.sample(n = 3, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(verification_data, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verification_data)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	if not is_isolation_forest:
		store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
		return
	scores = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
	flags = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
	# A prediction of -1 becomes "true", everything else "false".
	flags["outlier"] = flags["outlier"].apply(lambda label: str(bool(label == -1)).lower())
	store_csv(pandas.concat((scores, flags), axis = 1), name)
Esempio n. 2
0
def submodel_evaluation(train_data, valid_data, model_list,
                        category_feature, numeric_feature):
    """Fit each candidate model and score it on the validation data.

    Every model is wrapped in a PMMLPipeline behind the transformer returned
    by `feature_union(category_feature, numeric_feature)`; that single
    transformer object is reused for all models.

    Args:
        train_data: training data, indexed by the feature columns and by
            'user_type'.
        valid_data: validation data, indexed the same way.
        model_list: iterable of estimators to evaluate.
        category_feature: categorical column names (concatenated with
            `numeric_feature` using `+`).
        numeric_feature: numeric column names.

    Returns:
        dict mapping each model's class name to `[ks, auc, accuracy]`.
        Models sharing a class name overwrite each other's entry.
    """
    feature_columns = category_feature + numeric_feature
    X_train, y_train = train_data[feature_columns], train_data['user_type']
    X_valid, y_valid = valid_data[feature_columns], valid_data['user_type']

    pipeline_transformer = feature_union(category_feature, numeric_feature)
    model_result_dict = {}
    for model in model_list:
        model_name = model.__class__.__name__
        print('model {} evaluation'.format(model_name))

        steps = [('mapper', pipeline_transformer), ('classifier', model)]
        sub_model = PMMLPipeline(steps)
        sub_model.fit(X_train, y_train)

        # Second probability column is used as the score.
        predict_valid = sub_model.predict_proba(X_valid)[:, 1]
        predict_label = sub_model.predict(X_valid)

        model_result_dict[model_name] = [
            plot_ks_curve(predict_valid, y_valid),
            roc_auc_score(y_valid, predict_valid),
            metrics.accuracy_score(y_valid, predict_label),
        ]
    return model_result_dict
Esempio n. 3
0
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit `regressor` on the Auto-NA data behind an imputing mapper and store the results.

	For a `DecisionTreeRegressor` the non-zero node impurities are added to
	the PMML options as "node_extensions", and the stored CSV gains a
	"nodeId" column obtained from `regressor.apply`.

	Args:
		regressor: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		predict_transformer: passed through to `PMMLPipeline`.
		apply_transformer: passed through to `PMMLPipeline`.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	features = []
	for column in ["cylinders", "model_year"]:
		features.append(([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]))
	features.append((["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()]))
	features.append((["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()]))
	features.append((["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()]))
	features.append((["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")]))
	features.append((["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")]))
	steps = [
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	]
	pipeline = PMMLPipeline(steps, predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)
	is_tree = isinstance(regressor, DecisionTreeRegressor)
	if is_tree:
		# Collect the impurity of every node whose impurity is not exactly zero.
		tree = regressor.tree_
		node_impurity = {}
		for node_idx in range(tree.node_count):
			impurity = tree.impurity[node_idx]
			if impurity != 0.0:
				node_impurity[node_idx] = impurity
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if is_tree:
		Xt = pipeline_transform(pipeline, auto_na_X)
		node_ids = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, node_ids), axis = 1)
	store_csv(mpg, name)
Esempio n. 4
0
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Fit `regressor` on the Auto data behind a feature-engineering mapper and store the results.

	Args:
		regressor: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		fit_params: optional dict of keyword arguments for `pipeline.fit`;
			`None` (the default) means no extra arguments.
		predict_params: optional dict of keyword arguments for
			`pipeline.predict`, also handed to `pipeline.verify`; `None`
			(the default) means no extra arguments.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	# Use a fresh dict per call instead of a shared mutable default argument.
	if fit_params is None:
		fit_params = {}
	if predict_params is None:
		predict_params = {}
	# (cylinders, origin) pairs with a dedicated label; all other pairs map to "other".
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost regressors are verified with explicit precision / zero-threshold settings.
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Esempio n. 5
0
def build_iris(classifier, name, **pmml_options):
    """Fit `classifier` on the Iris data inside a PMMLPipeline and store the results.

    Args:
        classifier: estimator placed at the end of the pipeline.
        name: identifier handed to `store_pmml` and `store_csv`.
        **pmml_options: forwarded to `pipeline.configure`.
    """
    features = []
    for column in ("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"):
        features.append(([column], ContinuousDomain()))
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(features)),
        ("classifier", classifier),
    ])
    pipeline.fit(iris_X, iris_y)

    # XGBoost classifiers are verified with explicit tolerances.
    verify_options = {}
    if isinstance(classifier, XGBClassifier):
        verify_options = {"precision": 1e-5, "zeroThreshold": 1e-5}
    pipeline.verify(iris_X.sample(n=3, random_state=13), **verify_options)

    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)

    proba_columns = [
        "probability(setosa)",
        "probability(versicolor)",
        "probability(virginica)",
    ]
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X),
                              columns=proba_columns)
    results = pandas.concat((species, species_proba), axis=1)
    store_csv(results, name)
0
def build_auto(regressor, name, **pmml_options):
	"""Fit `regressor` on the Auto data behind a lookup/binarizing/scaling mapper and store the results.

	Args:
		regressor: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	# (cylinders, origin) pairs with a dedicated label; all other pairs map to "other".
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	cylinders_origin_lookup = MultiLookupTransformer(cylinders_origin_mapping, default_value = "other")
	features = [
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), cylinders_origin_lookup, LabelBinarizer()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	steps = [
		("mapper", DataFrameMapper(features)),
		("selector", SelectUnique()),
		("regressor", regressor)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	# XGBoost regressors are verified with explicit tolerances.
	verify_options = {}
	if isinstance(regressor, XGBRegressor):
		verify_options = {"precision" : 1e-5, "zeroThreshold" : 1e-5}
	pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), **verify_options)
	store_pkl(pipeline, name)
	predictions = pipeline.predict(auto_X)
	store_csv(DataFrame(predictions, columns = ["mpg"]), name)
Esempio n. 7
0
def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    """Fit a TF-IDF -> SelectKBest -> `classifier` pipeline on the sentiment data.

    Args:
        classifier: estimator placed at the end of the pipeline.
        name: base name; ".pkl" and ".csv" are appended before calling
            `store_pkl` and `store_csv`.
        with_proba: when truthy, the stored CSV also carries the
            "probability(0)"/"probability(1)" columns from `predict_proba`.
        **pmml_options: forwarded to `pipeline.configure`.
    """
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             # float32 term weights for random forests, float64 otherwise.
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectKBest(f_classif, k=500)), ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    # Plain truthiness test rather than comparing against True with ==.
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
Esempio n. 8
0
def build_auto_h2o(regressor, name):
    """Fit an H2O `regressor` on the Auto data and store the MOJO, pickle and CSV.

    Args:
        regressor: estimator placed at the end of the pipeline.
        name: base name; ".zip", ".pkl" and ".csv" are appended before
            calling `store_mojo`, `store_pkl` and `store_csv`.
    """
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]

    transformers = [(column, CategoricalDomain(), [column])
                    for column in cat_columns]
    transformers += [(column, ContinuousDomain(), [column])
                     for column in cont_columns]

    # Categorical columns are uploaded as "enum", continuous ones as "numeric".
    uploader = H2OFrameCreator(
        column_names=cat_columns + cont_columns,
        column_types=["enum"] * len(cat_columns) +
        ["numeric"] * len(cont_columns))

    pipeline = PMMLPipeline([
        ("transformer", ColumnTransformer(transformers)),
        ("uploader", uploader),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))

    # The MOJO is taken from the pipeline's final estimator.
    fitted_regressor = pipeline._final_estimator
    store_mojo(fitted_regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")

    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
Esempio n. 9
0
def build_audit(classifier, name, **pmml_options):
	"""Fit `classifier` on the Audit data inside a PMMLPipeline and store the results.

	For an `LGBMClassifier`, "Age" is treated as a categorical column and the
	categorical columns go through `label_encoder(name)`; for every other
	classifier "Age" is continuous and `label_binarizer(name)` is used.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: identifier forwarded to the domain/encoder factories and to
			`store_pmml`/`store_csv`; the value "XGBoostAuditNA" additionally
			triggers a float cast of `audit_X["Age"]` for XGBoost classifiers.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	is_lgbm = isinstance(classifier, LGBMClassifier)
	is_xgb = isinstance(classifier, XGBClassifier)
	if is_lgbm:
		cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Income", "Hours"]
		make_encoder = label_encoder
	else:
		cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Age", "Income", "Hours"]
		make_encoder = label_binarizer
	features = [([column], [cat_domain(name), make_encoder(name)]) for column in cat_columns]
	features += [([column], cont_domain(name)) for column in cont_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(features)),
		("classifier", classifier)
	])
	fit_options = {}
	if is_lgbm:
		fit_options["classifier__categorical_feature"] = [0, 1, 2, 3, 4, 5]
	elif is_xgb and name == "XGBoostAuditNA":
		# NOTE: this writes into the module-level `audit_X` frame in place.
		audit_X["Age"] = audit_X["Age"].astype(float)
	pipeline.fit(audit_X, audit_y, **fit_options)
	# XGBoost classifiers are verified with explicit tolerances.
	verify_options = {"precision" : 1e-5, "zeroThreshold" : 1e-5} if is_xgb else {}
	pipeline.verify(audit_X.sample(n = 3, random_state = 13), **verify_options)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	results = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(results, name)
0
def build_iris_vec(classifier, name):
	"""Fit `classifier` directly on the Iris data (no mapper) and store the results.

	Args:
		classifier: the only step of the pipeline.
		name: identifier handed to `store_pmml` and `store_csv`.
	"""
	steps = [("classifier", classifier)]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(iris_X, iris_y)
	store_pmml(pipeline, name)
	proba_columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]
	# Predicted label first, followed by the per-class probabilities.
	results = pandas.concat((
		DataFrame(pipeline.predict(iris_X), columns = ["Species"]),
		DataFrame(pipeline.predict_proba(iris_X), columns = proba_columns)
	), axis = 1)
	store_csv(results, name)
Esempio n. 11
0
def build_auto_isotonic(regressor, auto_isotonic_X, name):
	"""Fit `regressor` on `auto_isotonic_X` against the module-level `auto_y` and store the results.

	Args:
		regressor: the only step of the pipeline.
		auto_isotonic_X: feature data used for fitting, verification and prediction.
		name: identifier handed to `store_pkl` and `store_csv`.
	"""
	steps = [("regressor", regressor)]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(auto_isotonic_X, auto_y)
	verification_data = auto_isotonic_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_data)
	store_pkl(pipeline, name)
	predictions = pipeline.predict(auto_isotonic_X)
	store_csv(DataFrame(predictions, columns = ["mpg"]), name)
Esempio n. 12
0
def build_audit(mapper, classifier, name, **pmml_options):
    """Fit `mapper` + `classifier` on the Audit data and store the results.

    Args:
        mapper: first pipeline step.
        classifier: final pipeline step.
        name: identifier handed to `store_pkl` and `store_csv`.
        **pmml_options: forwarded to `pipeline.configure`.
    """
    steps = [
        ("mapper", mapper),
        ("classifier", classifier),
    ]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)

    # Predicted label first, followed by the two class probabilities.
    labels = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    probabilities = DataFrame(
        pipeline.predict_proba(audit_X),
        columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((labels, probabilities), axis=1), name)
Esempio n. 13
0
def build_apollo(mapper, name):
    """Fit `mapper` + a default DecisionTreeClassifier on the module-level `df`.

    The target is `df["success"]`; the whole of `df` is passed as the
    feature data.

    Args:
        mapper: first pipeline step.
        name: identifier handed to `store_pkl` and `store_csv`.
    """
    steps = [
        ("mapper", mapper),
        ("classifier", DecisionTreeClassifier()),
    ]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(df, df["success"])
    store_pkl(pipeline, name)

    # Predicted label first, followed by the two class probabilities.
    labels = DataFrame(pipeline.predict(df), columns=["success"])
    probabilities = DataFrame(
        pipeline.predict_proba(df),
        columns=["probability(false)", "probability(true)"])
    store_csv(pandas.concat((labels, probabilities), axis=1), name)
Esempio n. 14
0
def build_audit_dict(classifier, name, with_proba = True):
	"""Fit a DictVectorizer -> `classifier` pipeline on the Audit dictionary data and store the results.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		with_proba: when truthy, the stored CSV also carries the
			"probability(0)"/"probability(1)" columns from `predict_proba`.
	"""
	pipeline = PMMLPipeline([
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	])
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	# Plain truthiness test rather than comparing against True with ==.
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Esempio n. 15
0
def build_auto_opt(regressor, name, fit_params = None, **pmml_options):
	"""Fit `regressor` on the training subset of the Auto data and store the results.

	Fitting uses only the rows selected by `auto_train_mask`; verification
	and prediction use the full `auto_X`.

	Args:
		regressor: the only step of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		fit_params: optional dict of keyword arguments for `pipeline.fit`;
			`None` (the default) means no extra arguments.
		**pmml_options: forwarded to `pipeline.configure` when any are given.
	"""
	# Use a fresh dict per call instead of a shared mutable default argument.
	if fit_params is None:
		fit_params = {}
	pipeline = PMMLPipeline([
		("regressor", regressor)
	])
	pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params)
	# Previously the options were accepted but never applied; skip the call
	# when none are given so the no-options path is unchanged.
	if pmml_options:
		pipeline.configure(**pmml_options)
	# XGBoost regressors are verified with explicit tolerances.
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
Esempio n. 16
0
	def test_predict_transform(self):
		# Checks predict() and predict_transform() of a pipeline whose
		# predict_transformer is an identity + log10 feature union.
		X = DataFrame([[1, 0], [2, 0], [3, 0]], columns = ["X1", "X2"])
		y = Series([0.5, 1.0, 1.5], name = "y")
		transformers = [
			("identity", FunctionTransformer(None)),
			("log10", FunctionTransformer(numpy.log10))
		]
		pipeline = PMMLPipeline([("estimator", DummyRegressor())], predict_transformer = FeatureUnion(transformers))
		pipeline.fit(X, y)
		# Every row is expected to predict 1.0.
		expected_pred = [1.0] * 3
		# Expected row layout: prediction, identity output, log10 output.
		expected_row = [1.0, 1.0, numpy.log10(1.0)]
		self.assertEqual(expected_pred, pipeline.predict(X).tolist())
		self.assertEqual([expected_row] * 3, pipeline.predict_transform(X).tolist())
Esempio n. 17
0
def build_auto_na_hist(regressor, name):
	"""Fit `regressor` on the Auto-NA data behind a domain-only mapper and store the results.

	Args:
		regressor: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
	"""
	cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
	cat_columns = ["cylinders", "model_year", "origin"]
	# Continuous columns come first, then the binarized categorical ones.
	features = [([column], ContinuousDomain()) for column in cont_columns]
	features += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in cat_columns]
	steps = [
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(auto_na_X, auto_na_y)
	verification_data = auto_na_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_data)
	store_pkl(pipeline, name)
	predictions = pipeline.predict(auto_na_X)
	store_csv(DataFrame(predictions, columns = ["mpg"]), name)
Esempio n. 18
0
def build_iris_opt(classifier, name, fit_params = None, **pmml_options):
	"""Fit `classifier` on the training subset of the Iris data and store the results.

	Fitting uses only the rows selected by `iris_train_mask`; verification
	and prediction use the full `iris_X`.

	Args:
		classifier: the only step of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		fit_params: optional dict of keyword arguments for `pipeline.fit`;
			`None` (the default) means no extra arguments.
		**pmml_options: forwarded to `pipeline.configure` when any are given.
	"""
	# Use a fresh dict per call instead of a shared mutable default argument.
	if fit_params is None:
		fit_params = {}
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params)
	# Previously the options were accepted but never applied; skip the call
	# when none are given so the no-options path is unchanged.
	if pmml_options:
		pipeline.configure(**pmml_options)
	# XGBoost classifiers are verified with explicit tolerances.
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13))
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)
Esempio n. 19
0
def build_sentiment(classifier, transformer, name, with_proba = True, **pmml_options):
	"""Fit a `transformer` -> densifier -> SelectKBest -> `classifier` pipeline on the sentiment data.

	Args:
		classifier: estimator placed at the end of the pipeline.
		transformer: first pipeline step.
		name: identifier handed to `store_pmml` and `store_csv`.
		with_proba: when truthy, the stored CSV also carries the
			"probability(0)"/"probability(1)" columns from `predict_proba`.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	steps = [
		("transformer", transformer),
		("densifier", DenseTransformer()),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	frames = [DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])]
	if with_proba:
		frames.append(DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"]))
	# Without probabilities the label frame is stored as is.
	score = pandas.concat(tuple(frames), axis = 1) if len(frames) > 1 else frames[0]
	store_csv(score, name)
Esempio n. 20
0
def build_auto_na(regressor, name):
    """Fit `regressor` on the Auto-NA data behind an imputing mapper and store the results.

    For a `DecisionTreeRegressor` the non-zero node impurities are passed to
    `pipeline.configure` as "node_extensions".

    Args:
        regressor: estimator placed at the end of the pipeline.
        name: base name; ".pkl" and ".csv" are appended before calling
            `store_pkl` and `store_csv`.
    """
    acceleration_bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25]
    displacement_bins = [0, 100, 200, 300, 400, 500]

    features = [
        ([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer(),
        ])
        for column in ["cylinders", "model_year"]
    ]
    features.append((["origin"], [
        CategoricalImputer(missing_values=-1),
        OneHotEncoder(),
    ]))
    features.append((["acceleration"], [
        ContinuousDomain(missing_values=None),
        CutTransformer(bins=acceleration_bins, labels=False),
        CategoricalImputer(),
        LabelBinarizer(),
    ]))
    features.append((["displacement"], [
        ContinuousDomain(missing_values=None),
        Imputer(),
        CutTransformer(bins=displacement_bins,
                       labels=["XS", "S", "M", "L", "XL"]),
        LabelBinarizer(),
    ]))
    # horsepower and weight: values outside [low_value, high_value] are
    # handled with the "as_extreme_values" outlier treatment, then imputed.
    features.append((["horsepower"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_extreme_values",
                         low_value=50,
                         high_value=225),
        Imputer(),
    ]))
    features.append((["weight"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_extreme_values",
                         low_value=2000,
                         high_value=5000),
        Imputer(),
    ]))

    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(features)),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_na_X, auto_na_y)

    if isinstance(regressor, DecisionTreeRegressor):
        # Collect the impurity of every node whose impurity is not exactly zero.
        tree = regressor.tree_
        node_impurity = {}
        for node_idx in range(tree.node_count):
            impurity = tree.impurity[node_idx]
            if impurity != 0.0:
                node_impurity[node_idx] = impurity
        pipeline.configure(
            node_extensions={regressor.criterion: node_impurity})

    store_pkl(pipeline, name + ".pkl")
    predictions = pipeline.predict(auto_na_X)
    store_csv(DataFrame(predictions, columns=["mpg"]), name + ".csv")
Esempio n. 21
0
def build_visit(regressor, name):
	"""Fit `regressor` on the Visit data behind a one-hot/domain mapper and store the results.

	Args:
		regressor: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
	"""
	# "edlevel" and the five listed flag columns are one-hot encoded individually.
	encoded_columns = ["edlevel", "outwork", "female", "married", "kids", "self"]
	features = [([column], [CategoricalDomain(), OneHotEncoder()]) for column in encoded_columns]
	features.append((["age"], ContinuousDomain()))
	# "hhninc" and "educ" share a single ContinuousDomain.
	features.append((["hhninc", "educ"], ContinuousDomain()))
	steps = [
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(visit_X, visit_y)
	verification_data = visit_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_data)
	store_pkl(pipeline, name)
	predictions = pipeline.predict(visit_X)
	store_csv(DataFrame(predictions, columns = ["docvis"]), name)
Esempio n. 22
0
def build_audit_na_direct(classifier, name):
	"""Fit `classifier` on the Audit-NA data with pass-through numeric columns and store the results.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
	"""
	numeric_columns = ["Age", "Hours", "Income"]
	categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	features = [
		# No transformer is attached to the numeric columns.
		(numeric_columns, None),
		(categorical_columns, OneHotEncoder())
	]
	steps = [
		("mapper", DataFrameMapper(features)),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(audit_na_X, audit_na_y)
	verification_data = audit_na_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_data, precision = 1e-5, zeroThreshold = 1e-5)
	store_pkl(pipeline, name)
	# Predicted label first, followed by the two class probabilities.
	labels = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((labels, probabilities), axis = 1), name)
Esempio n. 23
0
def build_audit_na_hist(classifier, name):
	"""Fit `classifier` on the Audit-NA data behind a domain-only mapper and store the results.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
	"""
	cont_columns = ["Age", "Hours", "Income"]
	cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	# Continuous columns come first, then the binarized categorical ones.
	features = [([column], ContinuousDomain()) for column in cont_columns]
	features += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in cat_columns]
	steps = [
		("mapper", DataFrameMapper(features)),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(audit_na_X, audit_na_y)
	verification_data = audit_na_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_data)
	store_pkl(pipeline, name)
	labels = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((labels, probabilities), axis = 1), name)
Esempio n. 24
0
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
    """Fit `classifier` on the two petal columns of the Versicolor data and store the results.

    Args:
        classifier: estimator placed at the end of the pipeline.
        name: identifier handed to `store_pkl` and `store_csv`.
        with_proba: when truthy, the stored CSV also carries the
            "probability(0)"/"probability(1)" columns from `predict_proba`.
        **pmml_options: forwarded to `pipeline.configure`.
    """
    # Only the petal columns are passed through; all other columns are dropped.
    transformer = ColumnTransformer(
        [("all", "passthrough", ["Petal.Length", "Petal.Width"])],
        remainder="drop")
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    # Plain truthiness test rather than comparing against True with ==.
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name)
Esempio n. 25
0
def build_audit_dict(classifier, name, with_proba = True):
	"""Fit a DictVectorizer -> `classifier` pipeline (with a PMML header) on the Audit dictionary data.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		with_proba: when truthy, the stored CSV also carries the
			"probability(0)"/"probability(1)" columns from `predict_proba`.
	"""
	# Header metadata passed to the pipeline constructor.
	header = {
		"copyright" : "Copyright (c) 2021 Villu Ruusmann",
		"description" : "Integration test for dictionary (key-value mappings) input",
		"modelVersion" : "1.0.0"
	}
	pipeline = PMMLPipeline([
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	], header = header)
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	# Plain truthiness test rather than comparing against True with ==.
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Esempio n. 26
0
def build_sentiment(classifier, tokenizer, name, with_proba = True, **pmml_options):
	"""Fit a (TF-IDF + word count) -> SelectKBest -> `classifier` pipeline on the sentiment data.

	Args:
		classifier: estimator placed at the end of the pipeline; its type
			also selects the TF-IDF `sublinear_tf` and `dtype` settings.
		tokenizer: tokenizer handed to `TfidfVectorizer`.
		name: identifier handed to `store_pkl` and `store_csv`.
		with_proba: when truthy, the stored CSV also carries the
			"probability(0)"/"probability(1)" columns from `predict_proba`.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			# sublinear_tf is enabled only for LogisticRegressionCV; float32 term weights only for random forests.
			("tf-idf", TfidfVectorizer(analyzer = "word", preprocessor = None, strip_accents = None, lowercase = True, tokenizer = tokenizer, stop_words = "english", ngram_range = (1, 2), norm = None, sublinear_tf = isinstance(classifier, LogisticRegressionCV), dtype = (numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64))),
			("count", WordCountTransformer())
		])),
		("selector", SelectKBest(f_classif, k = 1000)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	# Plain truthiness test rather than comparing against True with ==.
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)
Esempio n. 27
0
def build_audit_h2o(classifier, name):
	"""Fit an H2O `classifier` on the Audit data and store the MOJO, pickle and CSV.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: identifier handed to `store_mojo`, `store_pkl` and `store_csv`.
	"""
	cont_columns = ["Age", "Hours", "Income"]
	cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
	features = [([column], ContinuousDomain()) for column in cont_columns]
	features += [([column], CategoricalDomain()) for column in cat_columns]
	steps = [
		("mapper", DataFrameMapper(features)),
		("uploader", H2OFrameCreator()),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps)
	# The target is uploaded as a single categorical H2O column.
	target = H2OFrame(audit_y.to_frame(), column_types = ["categorical"])
	pipeline.fit(audit_X, target)
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	# The MOJO is taken from the pipeline's final estimator.
	fitted_classifier = pipeline._final_estimator
	store_mojo(fitted_classifier, name)
	store_pkl(pipeline, name)
	adjusted = pipeline.predict(audit_X)
	adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
	store_csv(adjusted.as_data_frame(), name)
Esempio n. 28
0
def build_audit_na(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit `classifier` on the Audit-NA data behind an imputing mapper and store the results.

	For a `DecisionTreeClassifier` the stored CSV gains a "nodeId" column
	obtained from `classifier.apply`.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: identifier handed to `store_pkl` and `store_csv`.
		with_proba: when truthy, the stored CSV also carries the
			"probability(0)"/"probability(1)" columns from `predict_proba`.
		fit_params: optional dict of keyword arguments for `pipeline.fit`.
		predict_params: optional dict of keyword arguments for
			`pipeline.predict`, also handed to `pipeline.verify`.
		predict_proba_params: optional dict of keyword arguments for
			`pipeline.predict_proba`, also handed to `pipeline.verify`.
		predict_transformer: passed through to `PMMLPipeline`.
		predict_proba_transformer: passed through to `PMMLPipeline`.
		apply_transformer: passed through to `PMMLPipeline`.
		**pmml_options: forwarded to `pipeline.configure`.

	For the three `*_params` arguments, `None` (the default) means no extra
	arguments.
	"""
	# Use fresh dicts per call instead of shared mutable default arguments.
	if fit_params is None:
		fit_params = {}
	if predict_params is None:
		predict_params = {}
	if predict_proba_params is None:
		predict_proba_params = {}
	# Upper-cased employment values with an explicit target; others fall back to "OTHER".
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost classifiers are verified with explicit tolerances.
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	# Plain truthiness test rather than comparing against True with ==.
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Esempio n. 29
0
def build_audit_na(classifier, name, with_proba = True, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit `classifier` on the Audit-NA data behind an imputing mapper and store the results.

	For a `DecisionTreeClassifier` the stored CSV gains a "nodeId" column
	obtained from `classifier.apply`.

	Args:
		classifier: estimator placed at the end of the pipeline.
		name: base name; ".pkl" and ".csv" are appended before calling
			`store_pkl` and `store_csv`.
		with_proba: when truthy, the stored CSV also carries the
			"probability(0)"/"probability(1)" columns from `predict_proba`.
		predict_proba_transformer: passed through to `PMMLPipeline`.
		apply_transformer: passed through to `PMMLPipeline`.
		**pmml_options: forwarded to `pipeline.configure`.
	"""
	# Upper-cased employment values with an explicit target; others fall back to "OTHER".
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0,
		"MALE" : 1
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name = "flag_missing(Age, -999)"), Imputer(missing_values = -999)])] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), Imputer(missing_values = -999)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), Imputer()])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	# Plain truthiness test rather than comparing against True with ==.
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name + ".csv")
Esempio n. 30
0
            SimpleImputer(strategy="median")), ("scaler", StandardScaler())])

# Columns that are one-hot encoded; categories unseen during fitting are ignored.
categorical_features = ["age", "gender", "category"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route the numeric and categorical columns through their own transformers.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

# One-vs-rest logistic regression, tagged with a model-specific PMML name.
lr = LogisticRegression(multi_class="ovr")
lr.pmml_name_ = f"PMML-FraudDetection-{model_id}"

classifier = PMMLPipeline([
    ("preprocessor", preprocessor),
    ("classifier", lr),
])

# Build model
classifier.fit(train_x, train_y)

# Evaluate model
# NOTE(review): regression-style metrics are reported for a classifier here;
# confirm that evaluate_metrics is meant to be used this way.
test_actual = test_y
test_predicted = classifier.predict(test_x)
rmse, mae, r2 = evaluate_metrics(test_actual, test_predicted)
print("FraudDetection model:")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R2: {r2}")

# Export the fitted pipeline as PMML.
sklearn2pmml(classifier, output_path, with_repr=True)
print("Fraud detection model exported")