def test_fit(self):
    """Check that fitting a SelectorProxy exposes the wrapped selector's mask.

    Before ``fit`` the proxy must not have a ``support_mask_`` attribute;
    afterwards both the wrapped ``SelectKBest`` and the proxy must report the
    same mask, which for this data selects only the second column.
    """
    features = numpy.array([[0, 0], [1.0, 2.0]])
    target = numpy.array([0.5, 1.0])

    kbest = SelectKBest(score_func=f_regression, k=1)
    proxy = SelectorProxy(kbest)

    # An unfitted proxy carries no mask yet.
    self.assertFalse(hasattr(proxy, "support_mask_"))

    proxy.fit(features, target)

    expected_mask = [0, 1]
    self.assertEqual(expected_mask, kbest._get_support_mask().tolist())
    self.assertEqual(expected_mask, proxy.support_mask_.tolist())
def build_audit(classifier, name, with_proba=True):
    """Fit a PMML pipeline for ``classifier`` on the audit data and store it.

    The pipeline is a ``DataFrameMapper`` followed by ``classifier``. It is
    fitted on the module-level ``audit_X`` / ``audit_y``, then handed to
    ``store_pkl`` under ``name + ".pkl"``. Its predictions on ``audit_X`` are
    handed to ``store_csv`` under ``name + ".csv"``.

    Args:
        classifier: Final estimator of the pipeline.
        name: Base name used for both stored artifacts.
        with_proba: When equal to ``True``, the ``predict_proba`` output is
            appended as ``probability_0`` / ``probability_1`` columns.
    """
    # Model-based feature selectors applied to the binarized columns.
    tree_selector = SelectFromModel(
        EstimatorProxy(DecisionTreeClassifier(random_state=13)),
        threshold="1.25 * mean"
    )
    forest_selector = SelectorProxy(
        SelectFromModel(
            EstimatorProxy(
                RandomForestClassifier(random_state=13, n_estimators=3)
            ),
            threshold="median"
        )
    )

    column_specs = [
        ("Age", ContinuousDomain()),
        ("Employment", [LabelBinarizer(), tree_selector]),
        ("Education", [LabelBinarizer(), forest_selector]),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain()),
    ]
    mapper = DataFrameMapper(column_specs)

    steps = [("mapper", mapper), ("classifier", classifier)]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")

    result = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    # Equality (not truthiness) is kept so non-boolean flags behave as before.
    if with_proba == True:
        probabilities = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability_0", "probability_1"]
        )
        result = pandas.concat((result, probabilities), axis=1)
    store_csv(result, name + ".csv")
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit a feature-union PMML pipeline for ``classifier`` on the audit data.

    Continuous columns go through their own ``DataFrameMapper``; categorical
    columns go through a second mapper followed by ``PolynomialFeatures``.
    Both branches are joined with a ``FeatureUnion`` ahead of ``classifier``.
    The pipeline is fitted on the module-level ``audit_X`` / ``audit_y``.
    After fitting, ``customize(classifier, **kwargs)`` is called, the pipeline
    is handed to ``store_pkl`` under ``name + ".pkl"`` and the predictions to
    ``store_csv`` under ``name + ".csv"``.

    Args:
        classifier: Final estimator of the pipeline.
        name: Base name used for both stored artifacts.
        with_proba: When equal to ``True``, the ``predict_proba`` output is
            appended as ``probability(0)`` / ``probability(1)`` columns.
        **kwargs: Forwarded unchanged to ``customize``.
    """
    continuous_mapper = DataFrameMapper([
        (column, ContinuousDomain()) for column in ("Age", "Income", "Hours")
    ])

    # Model-based selectors for the two columns that use SelectFromModel.
    employment_selector = SelectorProxy(
        SelectFromModel(
            EstimatorProxy(DecisionTreeClassifier(random_state=13))
        )
    )
    education_selector = SelectorProxy(
        SelectFromModel(
            EstimatorProxy(
                RandomForestClassifier(random_state=13, n_estimators=3)
            ),
            threshold="1.25 * mean"
        )
    )
    categorical_mapper = DataFrameMapper([
        ("Employment", [CategoricalDomain(), LabelBinarizer(), employment_selector]),
        ("Education", [CategoricalDomain(), LabelBinarizer(), education_selector]),
        ("Marital", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3),
        ]),
        ("Occupation", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectorProxy(SelectKBest(k=3)),
        ]),
        ("Gender", [CategoricalDomain(), LabelBinarizer(neg_label=-3, pos_label=3)]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])

    categorical_branch = Pipeline([
        ("mapper", categorical_mapper),
        ("polynomial", PolynomialFeatures()),
    ])
    union = FeatureUnion([
        ("continuous", continuous_mapper),
        ("categorical", categorical_branch),
    ])
    pipeline = PMMLPipeline([("union", union), ("classifier", classifier)])

    pipeline.fit(audit_X, audit_y)
    # Customization is applied to the classifier only after it has been fitted.
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")

    result = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    # Equality (not truthiness) is kept so non-boolean flags behave as before.
    if with_proba == True:
        probabilities = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"]
        )
        result = pandas.concat((result, probabilities), axis=1)
    store_csv(result, name + ".csv")
def build_auto(regressor, name, fit_params=None, predict_params=None, **pmml_options):
    """Fit, configure, verify and store a PMML pipeline for ``regressor``.

    The pipeline is ``DataFrameMapper`` -> ``SelectUnique`` -> ``regressor``,
    fitted on the module-level ``auto_X`` / ``auto_y``. After fitting it is
    configured with ``pmml_options`` and verified against a 5% sample of
    ``auto_X``. The pipeline is then handed to ``store_pkl`` and its
    predictions (column ``mpg``) to ``store_csv``, both under ``name``.

    Args:
        regressor: Final estimator of the pipeline.
        name: Name passed unchanged to ``store_pkl`` and ``store_csv``.
        fit_params: Optional dict of extra keyword arguments for
            ``pipeline.fit``. ``None`` (the default) means no extras.
        predict_params: Optional dict of extra keyword arguments for
            ``pipeline.predict``, also passed to ``pipeline.verify``.
            ``None`` (the default) means no extras.
        **pmml_options: Forwarded to ``pipeline.configure``.
    """
    # Fresh dicts per call: the former ``{}`` defaults were single shared
    # objects, and ``predict_params`` is handed by reference to ``verify``.
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params

    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (4, 3): "4/3",
    }

    mapper = DataFrameMapper([
        (["cylinders"], [
            CategoricalDomain(),
            Alias(
                ExpressionTransformer("X[0] % 2.0 > 0.0", dtype=numpy.int8),
                name="odd(cylinders)",
                prefit=True
            ),
        ]),
        (["cylinders", "origin"], [
            MultiDomain([None, CategoricalDomain()]),
            MultiLookupTransformer(cylinders_origin_mapping, default_value="other"),
            OneHotEncoder(),
        ]),
        (["model_year"], [
            CategoricalDomain(),
            CastTransformer(str),
            ExpressionTransformer("'19' + X[0] + '-01-01'"),
            CastTransformer("datetime64[D]"),
            DaysSinceYearTransformer(1977),
            Binarizer(threshold=0),
        ], {"alias": "bin(model_year, 1977)"}),
        (["model_year", "origin"], [
            ConcatTransformer("/"),
            # NOTE(review): newer scikit-learn releases rename ``sparse`` to
            # ``sparse_output`` -- confirm against the targeted version.
            OneHotEncoder(sparse=False),
            SelectorProxy(
                SelectFromModel(
                    RandomForestRegressor(n_estimators=3, random_state=13),
                    threshold="1.25 * mean"
                )
            ),
        ]),
        (["weight", "displacement"], [
            ContinuousDomain(),
            ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype=numpy.float64),
        ], {"alias": "weight / displacement + 0.5"}),
        (["displacement", "horsepower", "weight", "acceleration"], [
            MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]),
            StandardScaler(),
        ]),
    ])

    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)

    verification_sample = auto_X.sample(frac=0.05, random_state=13)
    if isinstance(regressor, XGBRegressor):
        # XGBRegressor pipelines are verified with explicit 1e-5 tolerances;
        # every other regressor uses verify()'s own defaults.
        pipeline.verify(
            verification_sample,
            predict_params=predict_params,
            precision=1e-5,
            zeroThreshold=1e-5
        )
    else:
        pipeline.verify(verification_sample, predict_params=predict_params)

    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns=["mpg"])
    store_csv(mpg, name)
def build_housing(regressor, name, with_kneighbors=False):
    """Fit a PMML pipeline for ``regressor`` on the housing data and store it.

    All columns of the module-level ``housing_X`` are mapped through a single
    ``ContinuousDomain``. A nested pipeline then applies interaction-only
    degree-2 polynomial features, standard scaling and a 35th-percentile
    ``f_regression`` selector before ``regressor``. The fitted pipeline is
    handed to ``store_pkl`` under ``name + ".pkl"`` and the predictions
    (column ``MEDV``) to ``store_csv`` under ``name + ".csv"``.

    Args:
        regressor: Final estimator of the pipeline.
        name: Base name used for both stored artifacts.
        with_kneighbors: When equal to ``True``, ``regressor.kneighbors`` is
            called on the transformed data and ``neighbor(i)`` columns are
            appended to the stored predictions.
    """
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])

    polynomial = PolynomialFeatures(
        degree=2, interaction_only=True, include_bias=False
    )
    selector = SelectorProxy(
        SelectPercentile(score_func=f_regression, percentile=35)
    )
    transformer_pipeline = Pipeline([
        ("polynomial", polynomial),
        ("scaler", StandardScaler()),
        ("selector", selector),
    ])

    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer-pipeline", transformer_pipeline),
        ("regressor", regressor),
    ])
    pipeline.fit(housing_X, housing_y)
    store_pkl(pipeline, name + ".pkl")

    predictions = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    # Equality (not truthiness) is kept so non-boolean flags behave as before.
    if with_kneighbors == True:
        transformed = pipeline_transform(pipeline, housing_X)
        neighbors = regressor.kneighbors(transformed)
        id_columns = [
            "neighbor(" + str(rank + 1) + ")"
            for rank in range(regressor.n_neighbors)
        ]
        # Element 1 of the result is presumably the neighbor indices;
        # adding 1 shifts them to 1-based ids -- verify against the regressor.
        neighbor_ids = DataFrame(neighbors[1] + 1, columns=id_columns)
        predictions = pandas.concat((predictions, neighbor_ids), axis=1)
    store_csv(predictions, name + ".csv")
def build_sentiment(classifier, name, with_proba=True):
    """Fit a TF-IDF PMML pipeline for ``classifier`` on the sentiment data.

    The pipeline is ``TfidfVectorizer`` -> ``SelectPercentile(chi2, 10)``
    (wrapped in ``SelectorProxy``) -> ``classifier``, fitted on the
    module-level ``sentiment_X`` / ``sentiment_y``. The fitted pipeline is
    handed to ``store_pkl`` under ``name + ".pkl"`` and the predictions
    (column ``Score``) to ``store_csv`` under ``name + ".csv"``.

    Args:
        classifier: Final estimator of the pipeline.
        name: Base name used for both stored artifacts.
        with_proba: When equal to ``True``, the ``predict_proba`` output is
            appended as ``probability(0)`` / ``probability(1)`` columns.
    """
    # Random forests get float32 TF-IDF values; everything else gets float64.
    if isinstance(classifier, RandomForestClassifier):
        tfidf_dtype = numpy.float32
    else:
        tfidf_dtype = numpy.float64

    vectorizer = TfidfVectorizer(
        analyzer="word",
        preprocessor=None,
        strip_accents=None,
        lowercase=True,
        token_pattern=None,
        tokenizer=Splitter(),
        stop_words="english",
        ngram_range=(1, 2),
        norm=None,
        dtype=tfidf_dtype
    )
    selector = SelectorProxy(SelectPercentile(chi2, percentile=10))

    pipeline = PMMLPipeline([
        ("tf-idf", vectorizer),
        ("selector", selector),
        ("classifier", classifier),
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")

    result = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    # Equality (not truthiness) is kept so non-boolean flags behave as before.
    if with_proba == True:
        probabilities = DataFrame(
            pipeline.predict_proba(sentiment_X),
            columns=["probability(0)", "probability(1)"]
        )
        result = pandas.concat((result, probabilities), axis=1)
    store_csv(result, name + ".csv")
def build_auto(regressor, name, **pmml_options):
    """Fit, configure, verify and store a PMML pipeline for ``regressor``.

    The pipeline is ``DataFrameMapper`` -> ``SelectUnique`` -> ``regressor``,
    fitted on the module-level ``auto_X`` / ``auto_y``. After fitting it is
    configured with ``pmml_options`` and verified against a 5% sample of
    ``auto_X``. The pipeline is then handed to ``store_pkl`` and its
    predictions (column ``mpg``) to ``store_csv``, both under ``name``.

    Args:
        regressor: Final estimator of the pipeline.
        name: Name passed unchanged to ``store_pkl`` and ``store_csv``.
        **pmml_options: Forwarded to ``pipeline.configure``.
    """
    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (4, 3): "4/3",
    }

    year_origin_selector = SelectorProxy(
        SelectFromModel(
            RandomForestRegressor(random_state=13, n_estimators=3),
            threshold="1.25 * mean"
        )
    )
    odd_cylinders = Alias(
        ExpressionTransformer("X[0] % 2.0 > 0.0"),
        name="odd(cylinders)",
        prefit=True
    )

    column_specs = [
        (["cylinders", "origin"], [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            MultiLookupTransformer(cylinders_origin_mapping, default_value="other"),
            LabelBinarizer(),
        ]),
        (["cylinders"], odd_cylinders),
        # Pre/post 1973 oil crisis effects
        (
            ["model_year"],
            [CategoricalDomain(), Binarizer(threshold=77)],
            {"alias": "bin(model_year, 77)"},
        ),
        (["model_year", "origin"], [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            ConcatTransformer("/"),
            LabelBinarizer(),
            year_origin_selector,
        ]),
        (
            ["displacement", "horsepower", "weight", "acceleration"],
            [ContinuousDomain(), StandardScaler()],
        ),
        (
            ["weight", "displacement"],
            ExpressionTransformer("(X[0] / X[1]) + 0.5"),
            {"alias": "weight / displacement + 0.5"},
        ),
    ]
    mapper = DataFrameMapper(column_specs)

    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)

    # XGBRegressor pipelines are verified with explicit 1e-5 tolerances;
    # every other regressor uses verify()'s own defaults.
    verify_options = {}
    if isinstance(regressor, XGBRegressor):
        verify_options["precision"] = 1e-5
        verify_options["zeroThreshold"] = 1e-5
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13), **verify_options)

    store_pkl(pipeline, name)
    predictions = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(predictions, name)