def test_fit(self):
    """The proxied attribute appears on the proxy only after fitting."""
    estimator = DummyRegressor()
    proxy = EstimatorProxy(estimator, attr_names_=["constant_"])
    # Before fitting, the mirrored attribute must not exist on the proxy
    self.assertFalse(hasattr(proxy, "constant_"))
    X = numpy.array([[0], [0]])
    y = numpy.array([0.0, 2.0])
    proxy.fit(X, y)
    # After fitting, both the wrapped estimator and the proxy expose the
    # same fitted value (1.0 for this training target)
    self.assertEqual(1.0, estimator.constant_)
    self.assertEqual(1.0, proxy.constant_)
def build_audit(classifier, name, with_proba=True):
    """Fit an "Audit" classification pipeline and persist its artifacts.

    Builds a DataFrameMapper-based feature engineering stage around
    *classifier*, fits the resulting PMMLPipeline on the module-level
    ``audit_X``/``audit_y`` data, stores the fitted pipeline as a pickle,
    and stores the predictions as a CSV.

    Parameters
    ----------
    classifier : estimator
        Final estimator placed at the end of the pipeline.
    name : str
        Base name for the stored ".pkl" and ".csv" artifacts.
    with_proba : bool, default True
        When truthy, also store ``predict_proba`` output columns.
    """
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state=13)), threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state=13, n_estimators=3)), threshold="median"))
        ]),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    # Idiomatic truthiness test instead of the original "== True" comparison
    if with_proba:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
classifier.pmml_feature_importances_ = classifier.feature_importances_ else: pass if isinstance(classifier, XGBClassifier): pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5) else: pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params) store_pkl(pipeline, name) adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"]) if with_proba == True: adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1) store_csv(adjusted, name) if "Audit" in datasets: build_audit(EstimatorProxy(DecisionTreeClassifier(min_samples_leaf = 2, random_state = 13)), "DecisionTreeAudit", compact = False) build_audit(BaggingClassifier(DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAudit") build_audit(DummyClassifier(strategy = "most_frequent"), "DummyAudit") build_audit(EstimatorProxy(ExtraTreesClassifier(n_estimators = 10, min_samples_leaf = 5, random_state = 13)), "ExtraTreesAudit") build_audit(GBDTLRClassifier(RandomForestClassifier(n_estimators = 17, random_state = 13), LogisticRegression()), "GBDTLRAudit") build_audit(GBDTLRClassifier(XGBClassifier(n_estimators = 17, random_state = 13), LogisticRegression()), "XGBLRAudit") build_audit(GBDTLRClassifier(XGBRFClassifier(n_estimators = 7, max_depth = 6, random_state = 13), SGDClassifier(loss = "log", penalty = "elasticnet", random_state = 13)), "XGBRFLRAudit") build_audit(EstimatorProxy(GradientBoostingClassifier(loss = "exponential", init = None, random_state = 13)), "GradientBoostingAudit") build_audit(HistGradientBoostingClassifier(max_iter = 71, random_state = 13), 
"HistGradientBoostingAudit") build_audit(LGBMClassifier(objective = "binary", n_estimators = 37), "LGBMAudit", predict_params = {"num_iteration" : 17}, predict_proba_params = {"num_iteration" : 17}, num_iteration = 17) build_audit(LinearDiscriminantAnalysis(solver = "lsqr"), "LinearDiscriminantAnalysisAudit") build_audit(LinearSVC(penalty = "l1", dual = False, random_state = 13), "LinearSVCAudit", with_proba = False) build_audit(LogisticRegression(multi_class = "multinomial", solver = "newton-cg", max_iter = 500), "MultinomialLogisticRegressionAudit") build_audit(LogisticRegressionCV(cv = 3, multi_class = "ovr"), "OvRLogisticRegressionAudit") build_audit(BaggingClassifier(LogisticRegression(), n_estimators = 3, max_features = 0.5, random_state = 13), "LogisticRegressionEnsembleAudit") build_audit(GaussianNB(), "NaiveBayesAudit")
def build_audit(classifier, name, with_proba = True, **kwargs):
    """Fit a FeatureUnion-based "Audit" pipeline and persist its artifacts.

    Combines a continuous-feature mapper with a categorical mapper (whose
    output is expanded with PolynomialFeatures), fits the PMMLPipeline on the
    module-level ``audit_X``/``audit_y`` data, applies ``customize`` to the
    fitted classifier, then stores the pickled pipeline and a CSV of
    predictions.

    Parameters
    ----------
    classifier : estimator
        Final estimator placed at the end of the pipeline.
    name : str
        Base name for the stored ".pkl" and ".csv" artifacts.
    with_proba : bool, default True
        When truthy, also store ``predict_proba`` output columns.
    **kwargs
        Forwarded verbatim to ``customize`` after fitting.
    """
    continuous_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Income", ContinuousDomain()),
        ("Hours", ContinuousDomain())
    ])
    categorical_mapper = DataFrameMapper([
        ("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
        ("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
        ("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
        ("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
        ("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    # Post-fit customization hook (e.g. attaching PMML-specific attributes)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    # Idiomatic truthiness test instead of the original "== True" comparison
    if with_proba:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name + ".csv")