Ejemplo n.º 1
0
	def test_fit(self):
		selector = SelectKBest(score_func = f_regression, k = 1)
		selector_proxy = SelectorProxy(selector)
		self.assertFalse(hasattr(selector_proxy, "support_mask_"))
		selector_proxy.fit(numpy.array([[0, 0], [1.0, 2.0]]), numpy.array([0.5, 1.0]))
		self.assertEqual([0, 1], selector._get_support_mask().tolist())
		self.assertEqual([0, 1], selector_proxy.support_mask_.tolist())
Ejemplo n.º 2
0
def build_audit(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                            threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                                threshold="median"))
        ]), ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()), ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()), ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Ejemplo n.º 3
0
def build_audit(classifier, name, with_proba = True, **kwargs):
	continuous_mapper = DataFrameMapper([
		("Age", ContinuousDomain()),
		("Income", ContinuousDomain()),
		("Hours", ContinuousDomain())
	])
	categorical_mapper = DataFrameMapper([
		("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
		("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
		("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
		("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		("Deductions", [CategoricalDomain(), LabelEncoder()]),
	])
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	customize(classifier, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if(with_proba == True):
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")
Ejemplo n.º 4
0
def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Ejemplo n.º 5
0
def build_housing(regressor, name, with_kneighbors=False):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer-pipeline",
         Pipeline([
             ("polynomial",
              PolynomialFeatures(degree=2,
                                 interaction_only=True,
                                 include_bias=False)),
             ("scaler", StandardScaler()),
             ("selector",
              SelectorProxy(
                  SelectPercentile(score_func=f_regression, percentile=35))),
         ])), ("regressor", regressor)
    ])
    pipeline.fit(housing_X, housing_y)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if (with_kneighbors == True):
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        medv_ids = DataFrame(kneighbors[1] + 1,
                             columns=[
                                 "neighbor(" + str(x + 1) + ")"
                                 for x in range(regressor.n_neighbors)
                             ])
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
Ejemplo n.º 6
0
def build_sentiment(classifier, name, with_proba=True):
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectorProxy(SelectPercentile(chi2, percentile=10))),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if (with_proba == True):
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
Ejemplo n.º 7
0
def build_auto(regressor, name, **pmml_options):
    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (4, 3): "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders", "origin"], [
            MultiDomain([CategoricalDomain(),
                         CategoricalDomain()]),
            MultiLookupTransformer(cylinders_origin_mapping,
                                   default_value="other"),
            LabelBinarizer()
        ]),
        (["cylinders"],
         Alias(ExpressionTransformer("X[0] % 2.0 > 0.0"),
               name="odd(cylinders)",
               prefit=True)),
        (["model_year"], [CategoricalDomain(),
                          Binarizer(threshold=77)], {
                              "alias": "bin(model_year, 77)"
                          }),  # Pre/post 1973 oil crisis effects
        (["model_year", "origin"], [
            MultiDomain([CategoricalDomain(),
                         CategoricalDomain()]),
            ConcatTransformer("/"),
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(RandomForestRegressor(random_state=13,
                                                      n_estimators=3),
                                threshold="1.25 * mean"))
        ]),
        (["displacement", "horsepower", "weight",
          "acceleration"], [ContinuousDomain(),
                            StandardScaler()]),
        (["weight",
          "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {
              "alias": "weight / displacement + 0.5"
          })
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("selector", SelectUnique()),
                             ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name)