def build_housing(regressor, name, with_kneighbors=False, **kwargs):
    """Fit a housing regression pipeline, verify it, and store artifacts.

    All columns pass through a ContinuousDomain, are expanded with
    interaction-only degree-2 polynomial features, standardized, and reduced
    to the top 35% of features by f_regression score before fitting.

    Parameters
    ----------
    regressor : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_kneighbors : when truthy, also store 1-based nearest-neighbor row
        indices (regressor must expose `kneighbors` and `n_neighbors`).
    **kwargs : forwarded to `customize(regressor, ...)`.
    """
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([
        ("mapper", mapper),
        ("transformer-pipeline", Pipeline([
            ("polynomial", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
            ("scaler", StandardScaler()),
            ("selector", SelectPercentile(score_func=f_regression, percentile=35)),
        ])),
        ("regressor", regressor)
    ])
    pipeline.fit(housing_X, housing_y)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values, housing_y.name)
    pipeline.verify(housing_X.sample(frac=0.05, random_state=13))
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors:  # idiomatic truthiness instead of "== True"
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        # kneighbors[1] holds 0-based row indices; shift to 1-based for output
        medv_ids = DataFrame(kneighbors[1] + 1, columns=[
            "neighbor(" + str(x + 1) + ")" for x in range(regressor.n_neighbors)
        ])
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
def build_iris(classifier, name, with_proba=True, **kwargs):
    """Fit an iris classification pipeline, verify it, and store artifacts.

    Features pass through a ContinuousDomain, are duplicated on normal and
    log10 scales, robust-scaled, and reduced to 3 whitened PCA components.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_proba : when truthy, also store per-class probabilities.
    **kwargs : forwarded to `customize(classifier, ...)`.
    """
    pipeline = Pipeline([
        ("pipeline", Pipeline([
            ("domain", ContinuousDomain()),
            ("transform", FeatureUnion([
                ("normal_scale", FunctionTransformer(None)),
                ("log_scale", FunctionTransformer(numpy.log10))
            ]))
        ])),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
    # XGBoost needs relaxed verification tolerances (float32 arithmetic)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13), precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13))
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=[
            "probability(setosa)", "probability(versicolor)", "probability(virginica)"
        ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    """Fit an audit classification pipeline with label-encoded categoricals.

    "Age" and "Income" are continuous; "Hours" is additionally binned into
    five fixed intervals; the remaining categorical columns are label-encoded.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_proba : when truthy, also store probability(0)/probability(1).
    **fit_params : forwarded to `pipeline.fit(...)`.
    """
    mapper = DataFrameMapper(
        [([column], ContinuousDomain()) for column in ["Age", "Income"]] +
        [(["Hours"], [
            ContinuousDomain(),
            CutTransformer(bins=[0, 20, 40, 60, 80, 100], labels=False, right=False, include_lowest=True)
        ])] +
        [([column], [CategoricalDomain(), LabelEncoder()]) for column in [
            "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"
        ]])
    pipeline = Pipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_audit(classifier, name, with_proba=True, **pmml_options):
    """Fit an audit classification pipeline with per-column feature selection.

    Continuous columns go through a MultiDomain; each categorical column gets
    its own binarization/selection chain; the categorical branch is expanded
    with polynomial features before being joined with the continuous branch.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_proba : when truthy, also store probability(0)/probability(1).
    **pmml_options : forwarded to `pipeline.configure(...)`.
    """
    continuous_mapper = DataFrameMapper([
        # one ContinuousDomain per column (range(3) — no unused index needed)
        (["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for _ in range(3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        (["Education"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13, n_estimators=3), threshold="1.25 * mean")
        ]),
        (["Marital"], [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        (["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k=3)]),
        (["Gender"], [CategoricalDomain(), LabelBinarizer(neg_label=-3, pos_label=3)]),
        (["Deductions"], [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.configure(**pmml_options)
    # XGBoost needs relaxed verification tolerances (float32 arithmetic)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13), precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def save_model(model, current_time):
    """Persist *model* twice: as a pickle (.sav) and as a PMML document.

    Parameters
    ----------
    model : fitted scikit-learn estimator/pipeline.
    current_time : timestamp string used to suffix the output file names.
    """
    # pickle — use a context manager so the file handle is always closed
    # (the original passed an unclosed open() directly to pickle.dump)
    with open(f'outputs/model_{current_time}.sav', 'wb') as fp:
        pickle.dump(model, fp)
    # pmml
    pmml_object = sklearn2pmml.make_pmml_pipeline(model)
    sklearn2pmml.sklearn2pmml(pmml_object, f'outputs/model_{current_time}.pmml.xml')
def build_regressor(data, name):
    """Run a short TPOT regression search on *data*; store the fitted PMML
    pipeline as a pickle and its in-sample predictions as CSV."""
    X, y = data
    tpot_config = make_tpot_pmml_config(regressor_config_dict)
    # KNeighborsRegressor is excluded from the TPOT search space
    del tpot_config["sklearn.neighbors.KNeighborsRegressor"]
    regressor = TPOTRegressor(generations = 3, population_size = 3, random_state = 13, config_dict = tpot_config, verbosity = 2)
    regressor.fit(X, y)
    pmml_pipeline = make_pmml_pipeline(regressor.fitted_pipeline_, active_fields = X.columns.values, target_fields = [y.name])
    print(repr(pmml_pipeline))
    store_pkl(pmml_pipeline, name)
    store_csv(DataFrame(regressor.predict(X), columns = [y.name]), name)
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit an audit classification pipeline (string-column mapper variant).

    Like the list-column variant, but DataFrameMapper entries use bare string
    column selectors, and no verification step is run.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_proba : when truthy, also store probability(0)/probability(1).
    **kwargs : forwarded to `customize(classifier, ...)`.
    """
    continuous_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Income", ContinuousDomain()),
        ("Hours", ContinuousDomain())
    ])
    categorical_mapper = DataFrameMapper([
        ("Employment", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        ("Education", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13, n_estimators=3), threshold="1.25 * mean")
        ]),
        ("Marital", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        ("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectKBest(k=3)]),
        ("Gender", [CategoricalDomain(), LabelBinarizer(neg_label=-3, pos_label=3)]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_ocsvm_housing(svm, name):
    """Fit a one-class SVM outlier detector on the housing data and store
    decision-function values plus string-encoded outlier flags."""
    domain_mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", domain_mapper), ("scaler", MaxAbsScaler()), ("estimator", svm)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    store_pkl(pipeline, name)
    scores = pipeline.decision_function(housing_X)
    decisionFunction = DataFrame(scores, columns=["decisionFunction"])
    # non-positive predictions mark outliers; encode booleans as "true"/"false"
    flags = pipeline.predict(housing_X) <= 0
    outlier = DataFrame(flags, columns=["outlier"]).replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
def build_iforest_housing(iforest, name, **pmml_options):
    """Fit an IsolationForest anomaly detector on the housing data and store
    decision-function values plus string-encoded outlier flags."""
    domain_mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", domain_mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    scores = pipeline.decision_function(housing_X)
    decisionFunction = DataFrame(scores, columns=["decisionFunction"])
    # IsolationForest labels anomalies as -1; encode booleans as "true"/"false"
    flags = pipeline.predict(housing_X) == -1
    outlier = DataFrame(flags, columns=["outlier"]).replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
def build_classifier(data, name):
    """Run a short TPOT classification search on *data*; store the fitted PMML
    pipeline as a pickle and its in-sample predictions (plus per-class
    probabilities) as CSV."""
    X, y = data
    categories = pandas.unique(y)
    tpot_config = make_tpot_pmml_config(classifier_config_dict)
    # KNeighborsClassifier is excluded from the TPOT search space
    del tpot_config["sklearn.neighbors.KNeighborsClassifier"]
    classifier = TPOTClassifier(generations = 1, population_size = 3, random_state = 13, config_dict = tpot_config, verbosity = 2)
    classifier.fit(X, y)
    pmml_pipeline = make_pmml_pipeline(classifier.fitted_pipeline_, active_fields = X.columns.values, target_fields = [y.name])
    print(repr(pmml_pipeline))
    store_pkl(pmml_pipeline, name)
    result = DataFrame(classifier.predict(X), columns = [y.name])
    if len(categories) > 0:
        proba_columns = ["probability(" + str(category) + ")" for category in categories]
        probabilities = DataFrame(classifier.predict_proba(X), columns = proba_columns)
        result = pandas.concat([result, probabilities], axis = 1)
    store_csv(result, name)
def build_audit(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
    """Fit an audit classification pipeline with text-transformer features,
    expose feature importances for PMML conversion, verify, and store artifacts.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step (possibly wrapped
        in an EstimatorProxy).
    name : base file name for the pickle and CSV artifacts.
    with_proba : when truthy, also store probability(0)/probability(1).
    fit_params, predict_params, predict_proba_params : optional dicts
        forwarded to the corresponding pipeline calls.
    **pmml_options : forwarded to `pipeline.configure(...)`.
    """
    # None sentinels avoid shared mutable default arguments ({} defaults)
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params
    predict_proba_params = {} if predict_proba_params is None else predict_proba_params
    continuous_mapper = DataFrameMapper([
        (["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for _ in range(3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
        (["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
        (["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
        (["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
        (["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
        (["Deductions"], [CategoricalDomain()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.configure(**pmml_options)
    # Copy feature importances onto the pmml_feature_importances_ attribute
    # that the PMML converter reads
    if isinstance(classifier, EstimatorProxy):
        estimator = classifier.estimator
        if hasattr(estimator, "estimators_"):
            child_estimators = estimator.estimators_
            if isinstance(child_estimators, numpy.ndarray):
                child_estimators = child_estimators.flatten().tolist()
            for child_estimator in child_estimators:
                child_estimator.pmml_feature_importances_ = child_estimator.feature_importances_
    elif isinstance(classifier, XGBClassifier):
        classifier.pmml_feature_importances_ = classifier.feature_importances_
    else:
        pass
    # XGBoost needs relaxed verification tolerances (float32 arithmetic)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
def build_wheat(kmeans, name, with_affinity=True, **pmml_options):
    """Fit a k-means clustering pipeline on the wheat data and store artifacts.

    Parameters
    ----------
    kmeans : clusterer used as the final pipeline step (3 clusters expected
        by the affinity columns below).
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_affinity : when truthy, also store per-cluster distances
        affinity(0..2) computed via `kmeans_distance`.
    **pmml_options : forwarded to `pipeline.configure(...)`.
    """
    mapper = DataFrameMapper([(wheat_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("scaler", MinMaxScaler()), ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    pipeline = make_pmml_pipeline(pipeline, wheat_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity:  # idiomatic truthiness instead of "== True"
        Xt = pipeline_transform(pipeline, wheat_X)
        affinity_0 = kmeans_distance(kmeans, 0, Xt)
        affinity_1 = kmeans_distance(kmeans, 1, Xt)
        affinity_2 = kmeans_distance(kmeans, 2, Xt)
        cluster_affinity = DataFrame(
            numpy.transpose([affinity_0, affinity_1, affinity_2]),
            columns=["affinity(0)", "affinity(1)", "affinity(2)"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def build_auto(regressor, name, **pmml_options):
    """Fit an auto-mpg regression pipeline with a joint (cylinders, origin)
    lookup feature, verify it, and store artifacts."""
    # joint (cylinders, origin) levels that are kept; everything else -> "other"
    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (6, 3): "6/3",
        (4, 3): "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders", "origin"], [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            MultiLookupTransformer(cylinders_origin_mapping, default_value="other"),
            LabelBinarizer()
        ]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)], {
            "alias": "bin(model_year, 77)"
        }),  # Pre/post 1973 oil crisis effects
        (["displacement", "horsepower", "weight", "acceleration"],
         [ContinuousDomain(), StandardScaler()]),
        (["weight", "displacement"], ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {
            "alias": "weight / displacement + 0.5"
        })
    ])
    pipeline = Pipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    pipeline.configure(**pmml_options)
    verification_sample = auto_X.sample(frac=0.05, random_state=13)
    if isinstance(regressor, XGBRegressor):
        # XGBoost needs relaxed verification tolerances (float32 arithmetic)
        pipeline.verify(verification_sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(verification_sample)
    store_pkl(pipeline, name + ".pkl")
    store_csv(DataFrame(pipeline.predict(auto_X), columns=["mpg"]), name + ".csv")
def main():
    """Train a DecisionTree on the input CSV and export it as a PMML model.

    Reads the CSV named by --input_path, drops its first (index) column,
    splits off the "target_class" column as the label, fits an entropy
    decision tree on a 70% train split, and writes the model to
    ../models/bankrot.pmml.
    """
    parser = argparse.ArgumentParser(
        prog=__file__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input_path', help='Path to the .csv with coefs and classes', required=True)
    args = parser.parse_args()
    df = pd.read_csv(args.input_path)
    # drop the leading unnamed index column from the CSV
    df = df.drop(df.columns[0], axis=1)
    Y = df["target_class"]
    X = df.drop("target_class", axis=1)
    print(X.shape)
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    # NOTE(review): the test split is never evaluated here; the unused
    # clf.predict(X_test) call was removed as dead computation
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
    clf = clf.fit(X_train, Y_train)
    pipeline = make_pmml_pipeline(clf)
    sklearn2pmml(pipeline, pmml="../models/bankrot.pmml")
def build_versicolor(classifier, name, with_proba=True, **kwargs):
    """Fit a binary versicolor classification pipeline and store artifacts.

    Features are robust-scaled, expanded with degree-3 polynomial features,
    and passed through a no-op SelectKBest(k="all") before fitting.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_proba : when truthy, also store probability(0)/probability(1).
    **kwargs : forwarded to `customize(classifier, ...)`.
    """
    mapper = DataFrameMapper([
        (versicolor_X.columns.values, [ContinuousDomain(), RobustScaler()])
    ])
    pipeline = Pipeline([
        ("mapper", mapper),
        ("transformer-pipeline", Pipeline([
            ("polynomial", PolynomialFeatures(degree=3)),
            ("selector", SelectKBest(k="all"))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline = make_pmml_pipeline(pipeline, versicolor_X.columns.values, versicolor_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        species_proba = DataFrame(
            pipeline.predict_proba(versicolor_X),
            columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_iris(classifier, name, with_proba=True, **pmml_options):
    """Fit an iris classification pipeline with aggregated/multi-scale features.

    Length and width column pairs are averaged via Aggregator; features are
    then duplicated on normal, log10, and squared scales and reduced to 3
    whitened PCA components.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_proba : when truthy, also store per-class probabilities.
    **pmml_options : forwarded to `pipeline.configure(...)`.
    """
    pipeline = Pipeline([
        ("pipeline", Pipeline([
            ("mapper", DataFrameMapper([
                (iris_X.columns.values, ContinuousDomain()),
                (["Sepal.Length", "Petal.Length"], Aggregator(function="mean")),
                (["Sepal.Width", "Petal.Width"], Aggregator(function="mean"))
            ])),
            ("transform", FeatureUnion([
                ("normal_scale", FunctionTransformer(None)),
                ("log_scale", FunctionTransformer(numpy.log10)),
                ("power_scale", PowerFunctionTransformer(power=2))
            ]))
        ])),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
    pipeline.configure(**pmml_options)
    # XGBoost needs relaxed verification tolerances (float32 arithmetic)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13), precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=[
            "probability(setosa)", "probability(versicolor)", "probability(virginica)"
        ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_audit_cat(classifier, name, with_proba = True, fit_params = None):
    """Fit an audit classification pipeline with ordinal-encoded categoricals.

    "Age" and "Income" are continuous; "Hours" is binned; Employment/Education
    share a MultiDomain with a joint OrdinalEncoder; Marital and Occupation
    use OrdinalEncoders of differing dtypes; Gender and Deductions are
    label-encoded.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the pickle and CSV artifacts.
    with_proba : when truthy, also store probability(0)/probability(1).
    fit_params : optional dict forwarded to `pipeline.fit(...)`.
    """
    # None sentinel avoids a shared mutable default argument ({} default)
    fit_params = {} if fit_params is None else fit_params
    mapper = DataFrameMapper(
        [([column], ContinuousDomain()) for column in ["Age", "Income"]] +
        [(["Hours"], [ContinuousDomain(), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
        [(["Employment", "Education"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), OrdinalEncoder(dtype = numpy.int_)])] +
        [(["Marital"], [CategoricalDomain(), OrdinalEncoder(dtype = numpy.uint16)])] +
        [(["Occupation"], [CategoricalDomain(), OrdinalEncoder(dtype = numpy.float_)])] +
        [([column], [CategoricalDomain(), LabelEncoder()]) for column in ["Gender", "Deductions"]]
    )
    pipeline = Pipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
def build_regressor(data, feature_pipeline, generations, population_size, name):
    """Feature-engineer *data*, run a TPOT regression search on the transformed
    matrix, and store the combined (features + model) PMML pipeline."""
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    tpot_config = filter_config(make_tpot_pmml_config(regressor_config_dict))
    # KNeighborsRegressor is excluded from the TPOT search space
    del tpot_config["sklearn.neighbors.KNeighborsRegressor"]
    regressor = TPOTRegressor(generations=generations, population_size=population_size, random_state=13, config_dict=tpot_config, verbosity=2)
    regressor.fit(Xt, y)
    # prepend the feature-engineering steps to the fitted TPOT pipeline
    combined = Pipeline(steps=feature_pipeline.steps + regressor.fitted_pipeline_.steps)
    combined = make_pmml_pipeline(combined, active_fields=X.columns.values, target_fields=[y.name])
    print(repr(combined))
    store_pkl(combined, name)
    store_csv(DataFrame(regressor.predict(Xt), columns=[y.name]), name)
def build_auto(regressor, name, **kwargs):
    """Fit an auto-mpg regression pipeline (one-hot origin variant) and store
    the pickle and prediction CSV artifacts."""
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight", "acceleration"],
         [ContinuousDomain(), StandardScaler()]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)], {
            "alias": "bin(model_year, 77)"
        }),  # Pre/post 1973 oil crisis effects
        (["origin"], OneHotEncoder()),
        (["weight", "displacement"], ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {
            "alias": "weight / displacement + 0.5"
        })
    ])
    pipeline = Pipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    store_csv(DataFrame(pipeline.predict(auto_X), columns=["mpg"]), name + ".csv")
def build_versicolor(classifier, name, with_proba=True, **pmml_options):
    """Fit a binary versicolor classification pipeline (ColumnTransformer
    variant), verify it, and store artifacts.

    Parameters
    ----------
    classifier : estimator used as the final pipeline step.
    name : base file name for the "<name>.pkl" and "<name>.csv" artifacts.
    with_proba : when truthy, also store probability(0)/probability(1).
    **pmml_options : forwarded to `pipeline.configure(...)`.
    """
    transformer = ColumnTransformer([
        ("continuous_columns", Pipeline([
            ("domain", ContinuousDomain()),
            ("scaler", RobustScaler())
        ]), versicolor_X.columns.values)
    ])
    pipeline = Pipeline([
        ("transformer", transformer),
        ("transformer-selector-pipeline", Pipeline([
            ("polynomial", PolynomialFeatures(degree=3)),
            ("selector", SelectKBest(k="all"))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline = make_pmml_pipeline(pipeline, versicolor_X.columns.values, versicolor_y.name)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:  # idiomatic truthiness instead of "== True"
        species_proba = DataFrame(
            pipeline.predict_proba(versicolor_X),
            columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_classifier(data, feature_pipeline, generations, population_size, name):
    """Feature-engineer *data*, run a TPOT classification search on the
    transformed matrix, verify, and store the combined PMML pipeline plus
    predictions and per-class probabilities."""
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    categories = pandas.unique(y)
    tpot_config = filter_config(make_tpot_pmml_config(classifier_config_dict))
    # Estimators excluded from the TPOT search space:
    del tpot_config["sklearn.naive_bayes.GaussianNB"]  # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
    del tpot_config["sklearn.neighbors.KNeighborsClassifier"]
    del tpot_config["sklearn.svm.LinearSVC"]  # Does not support classifier.predict_proba(Xt)
    del tpot_config["sklearn.tree.DecisionTreeClassifier"]
    classifier = TPOTClassifier(generations=generations, population_size=population_size, random_state=13, config_dict=tpot_config, verbosity=2)
    classifier.fit(Xt, y)
    # prepend the feature-engineering steps to the fitted TPOT pipeline
    combined = Pipeline(steps=feature_pipeline.steps + classifier.fitted_pipeline_.steps)
    combined = make_pmml_pipeline(combined, active_fields=X.columns.values, target_fields=[y.name])
    combined.verify(X.sample(frac=0.05, random_state=13))
    print(repr(combined))
    store_pkl(combined, name)
    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if len(categories) > 0:
        proba_columns = ["probability(" + str(category) + ")" for category in categories]
        probabilities = DataFrame(classifier.predict_proba(Xt), columns=proba_columns)
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
# coding:utf-8
# Load a pickled model and export it as PMML.
import sklearn, sklearn.externals.joblib, sklearn_pandas, sklearn2pmml
import sys
# NOTE(review): Python 2-only encoding hack; remove when porting to Python 3
reload(sys)
sys.setdefaultencoding("utf-8")
from sklearn2pmml import PMMLPipeline
from sklearn.datasets import load_iris
from sklearn import tree
# BUG FIX: this import was commented out, leaving make_pmml_pipeline undefined
# and the name "sklearn2pmml" bound to the module (not callable as a function)
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.externals import joblib
obj = joblib.load("model.m")
pmml_pipeline = make_pmml_pipeline(obj, active_fields=[
    'sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity',
    'alcohol', 'age'
], target_fields=['chd'])
sklearn2pmml(pmml_pipeline, "result.pmml", with_repr=True, debug=True)
else: incorrect += 1 if predicted == "true": falsePositive += 1 else: falseNegative += 1 sensitivity = truePositive / (truePositive + falseNegative) specificity = trueNegative / (trueNegative + falsePositive) # Print results print(f"Results for model {type(model).__name__}") print(f"Correct: {correct}") print(f"Incorrect: {incorrect}") print(f"Accuracy: {100 * correct / total:.2f}%") print(f"True Positive Rate: {100 * sensitivity:.2f}%") print(f"True Negative Rate: {100 * specificity:.2f}%") from sklearn2pmml import sklearn2pmml from sklearn2pmml import make_pmml_pipeline # Export the trained model in PMML pipeline = make_pmml_pipeline( model, active_fields= ["category", "urgency", "targetPrice", "price"], target_fields= ["approval"] ) sklearn2pmml(pipeline, "order-approval.pmml")
norm=None, tokenizer=Splitter())), ( 'linear', SGDClassifier( # params )), ]) # Train the model pipeline.fit(train_data, train_labels) # Pack verification data and verify pipeline.verify(test_data) # Save the pipeline + model sklearn2pmml.sklearn2pmml(sklearn2pmml.make_pmml_pipeline( pipeline, active_fields=FEATURE_COLS, target_fields=TARGET_COL), 'model.pmml', with_repr=True, debug=True) # Measure the accuracy descision = pipeline.decision_function(test_data.sample(1)) jpmml_input_data = pd.read_csv("jpmml-test-input.csv", usecols=FEATURE_COLS) input_pred = pipeline.predict(jpmml_input_data.squeeze()) with numpy.printoptions(threshold=numpy.inf): print(input_pred) pred = pipeline.predict(test_data) print('Accuracy = {:.3f}'.format( sum(l == p for l, p in zip(test_labels, pred)) / len(test_labels)))
# Convert the fitted tree into rule code (fc_file_name=False: no file written
# — presumably; TODO confirm against tree_to_rulescode)
a = tree_to_rulescode(clf, dx_names, fc_file_name=False)
# Collect KS and AUC metrics for the train and test splits
model_eval = pd.DataFrame(columns=['data_catogary', 'KS', 'AUC'])
model_eval.loc[0, :] = [
    'train_DATA',
    KS(clf, X_train, Y_train),
    auc(clf, X_train, Y_train)
]
model_eval.loc[1, :] = [
    'test_DATA',
    KS(clf, X_test, Y_test),
    auc(clf, X_test, Y_test)
]
# Extend the evaluation table via the project-specific model_ev helper
model_eval = model_ev(clf=clf,
                      df=df,
                      dx_feacolname=dx_feature_col,
                      df_result=model_eval,
                      loan_month='type',
                      bad_='bad')
# Wrap the classifier as a PMML pipeline and export it
pipeline = sp.make_pmml_pipeline(clf,
                                 active_fields=dx_names,
                                 target_fields='is_bad')
pipeline.configure(node_id=True, winner_id=True,
                   numberofFields=True)  # add node_id to the PMML file
pmml_path = path_v + '\\' + '_model.pmml'
sklearn2pmml(pipeline, pmml_path, with_repr=True)