Example #1
def getFirstContent(dataUrl, modelUrl, modelName):
    training_data = load_files(dataUrl, encoding="utf-8")
    '''
    Start feature extraction; here the features are word-frequency counts.
    '''
    count_vect = CountVectorizer()

    X_train_counts = count_vect.fit_transform(training_data.data)

    '''
    Continue feature extraction; here the features are TF-IDF weights.
    '''
    tfidf_transformer = TfidfTransformer()

    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    '''
    Train a classifier and make a simple prediction
    (despite the mnb_ prefix, a LogisticRegression is used here).
    '''
    mnb_pipeline = PMMLPipeline([("classifier", LogisticRegression())])

    mnb_pipeline.fit(X_train_tfidf, training_data.target)

    # Save in pkl format
    joblib.dump(mnb_pipeline, modelUrl + modelName)
    # Save in pmml format
    sklearn2pmml(mnb_pipeline, modelUrl + modelName, with_repr=True)

    if os.path.exists(modelUrl + modelName):
        return "success"
    else:
        return "fail"
Example #2
def convert_sklearn_to_pmml(model, pmml, feature_names=None, target_name=None):
    pipeline = PMMLPipeline([("regressor", model)])
    if feature_names is not None:
        pipeline.active_fields = feature_names
    if target_name is not None:
        pipeline.target_field = target_name
    sklearn2pmml(pipeline, pmml, with_repr=True, debug=True)
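A minimal usage sketch for convert_sklearn_to_pmml above; the toy data, field names, and output path are illustrative assumptions, not from the original source:

import pandas as pd
from sklearn.linear_model import LinearRegression

# Fit a plain sklearn model first; the helper only wraps and exports it.
X = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0], "x2": [0.5, 1.5, 2.5, 3.5]})
y = pd.Series([1.0, 2.0, 3.0, 4.0], name="y")
model = LinearRegression().fit(X, y)

convert_sklearn_to_pmml(model, "linear.pmml",
                        feature_names=["x1", "x2"], target_name="y")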
Example #3
def train(data_conf, model_conf, **kwargs):
    """Python train method called by AOA framework

    Parameters:
    data_conf (dict): The dataset metadata
    model_conf (dict): The model configuration to use

    Returns:
    None: No return

    """

    # load data & engineer
    iris_df = pd.read_csv(data_conf['location'])
    train, _ = train_test_split(iris_df, test_size=0.5, random_state=42)
    X = train.drop("species", axis=1)
    y = train['species']

    print("Starting training...")
    # fit model to training data
    classifier = PMMLPipeline([('classifier', RandomForestClassifier())])
    classifier.fit(X, y.values.ravel())
    print("Finished training")

    # export model artefacts to models/ folder
    if not os.path.exists('models'):
        os.makedirs('models')
    sklearn2pmml(classifier, "models/model.pmml")
    print("Saved trained model")
Example #4
def xgboost_to_pmml(data_X, data_y, par_file, save_model_as):
    """Save Xgboost Model to PMMl file.

    Parameters
    ----------
    data_X : pandas.DataFrame
        Variables of train data.
    date_y : pandas.DataFrame
        Lables of train data.
    par_file : str
        File path of model's parameters.
    save_model_as : str
        File path of PMML.

    Returns
    -------
    None
        Generate PMML file locally as `save_model_as` given.

    Examples
    --------
    >>> xgboost_to_pmml(data_x, data_y, "par.json", "model.pmml")
    """
    # Create Xgboost Model
    with open(par_file, "r") as f:
        par = json.load(f)
    xgb_now = XGBClassifier(**par)
    # Create Pipeline
    pipeline = PMMLPipeline([("classifier", xgb_now)])
    # Fit Model
    pipeline.fit(data_X, data_y)
    # Save Model
    sklearn2pmml(pipeline, save_model_as, with_repr=True)
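The par_file is plain JSON of XGBClassifier keyword arguments; a minimal sketch that writes one and calls the function (hyperparameter values, column names, and file paths are illustrative):

import json
import pandas as pd

# Any XGBClassifier keyword arguments can go in the parameter file.
with open("par.json", "w") as f:
    json.dump({"n_estimators": 50, "max_depth": 3, "learning_rate": 0.1}, f)

data_X = pd.DataFrame({"f0": [0.1, 0.7, 0.2, 0.9], "f1": [1.0, 0.3, 0.8, 0.2]})
data_y = pd.Series([0, 1, 0, 1])
xgboost_to_pmml(data_X, data_y, "par.json", "model.pmml")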
Example #5
def save_as_PMML(data, modelPath):
    """
    Save the model as PMML using sklearn2pmml.
    """
    model = PMMLPipeline([("regressor", linear_model.LinearRegression())])
    model.fit(data[["x"]], data["y"])
    sklearn2pmml(model, "linear.pmml", with_repr=True)
Example #6
def train_and_save_model(data, model_path):
    """
    利用sklearn2pmml将模型存储为PMML
    """
    model = PMMLPipeline([("regressor", linear_model.LinearRegression())])
    model.fit(data[["x"]], data["y"])
    sklearn2pmml(model, model_path)
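A usage sketch with a toy frame matching the expected "x"/"y" columns (values and output path are illustrative):

import pandas as pd

data = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "y": [2.1, 3.9, 6.2, 7.8]})
train_and_save_model(data, "linear.pmml")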
Example #7
def build_housing(regressor, name, with_kneighbors=False):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer-pipeline",
         Pipeline([
             ("polynomial",
              PolynomialFeatures(degree=2,
                                 interaction_only=True,
                                 include_bias=False)),
             ("scaler", StandardScaler()),
             ("selector",
              SelectorProxy(
                  SelectPercentile(score_func=f_regression, percentile=35))),
         ])), ("regressor", regressor)
    ])
    pipeline.fit(housing_X, housing_y)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors:
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        medv_ids = DataFrame(kneighbors[1] + 1,
                             columns=[
                                 "neighbor(" + str(x + 1) + ")"
                                 for x in range(regressor.n_neighbors)
                             ])
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
Example #8
def save_model(model, feature_names, model_path, label_text="label"):
    p, extension = os.path.splitext(model_path)
    model.feature_names = feature_names
    pickle_path = p + ".pkl"

    if extension == ".pmml":
        try:
            from sklearn2pmml import sklearn2pmml, PMMLPipeline
        except ImportError:
            raise ImportError(
                "You need to install `sklearn2pmml` to store models in pmml format"
            )

        pipeline = PMMLPipeline([("model", model)])
        pipeline.target_field = label_text
        pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pipeline, model_path)

    elif extension == ".onnx":

        try:
            from skl2onnx import convert_sklearn
            from skl2onnx.common.data_types import FloatTensorType
            from skl2onnx.helpers.onnx_helper import select_model_inputs_outputs
            from onnx.onnx_pb import StringStringEntryProto
        except ImportError:
            raise ImportError(
                "You need to install `skl2onnx` to store models in onnx format"
            )

        onnx = convert_sklearn(
            model,
            name=label_text,
            initial_types=[("input", FloatTensorType((None, len(feature_names))))],
            doc_string="Model created by aict-tools to estimate {}".format(label_text),
        )

        # this makes sure we only get the scores and that they are numpy arrays and not
        # a list of dicts.
        # must come before setting metadata as it clears the metadata_props
        if hasattr(model, "predict_proba"):
            onnx = select_model_inputs_outputs(onnx, ["probabilities"])

        metadata = dict(
            model_author="aict-tools",
            aict_tools_version=__version__,
            feature_names=",".join(feature_names),
            model_type="classifier" if is_classifier(model) else "regressor",
        )
        for key, value in metadata.items():
            onnx.metadata_props.append(StringStringEntryProto(key=key, value=value))

        with open(model_path, "wb") as f:
            f.write(onnx.SerializeToString())
    else:
        pickle_path = model_path

    # Always store the pickle dump, just in case
    joblib.dump(model, pickle_path, compress=4)
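A hedged usage sketch for save_model; the classifier, feature names, and output path are illustrative, and the file extension selects the export branch:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(40, 3)
y = np.random.randint(0, 2, size=40)
clf = RandomForestClassifier(n_estimators=5).fit(X, y)

# ".pmml" takes the sklearn2pmml branch; a .pkl copy is written alongside.
save_model(clf, ["f0", "f1", "f2"], "model.pmml", label_text="signal")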
Example #9
def pmml(x, Y):
    from sklearn2pmml import PMMLPipeline, sklearn2pmml

    LR_pipeline = PMMLPipeline([
        ("classifier", LogisticRegression())
    ])

    # Train the model
    LR_pipeline.fit(x, Y)
    sklearn2pmml(LR_pipeline, "LogisticRegression.pmml")
Example #10
def build_auto_na(regressor, name):
	mapper = DataFrameMapper(
		[([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]] +
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_na_X, auto_na_y)
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
Example #11
def pickle_model(classifier, feature_names, model_path, label_text='label'):
    p, extension = path.splitext(model_path)
    classifier.feature_names = feature_names

    if extension == '.pmml':
        joblib.dump(classifier, p + '.pkl', compress=4)

        pipeline = PMMLPipeline([('classifier', classifier)])
        pipeline.target_field = label_text
        pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pipeline, model_path)

    else:
        joblib.dump(classifier, model_path, compress=4)
Example #12
class FirstStep(object):
    def __init__(self):
        self.__iris = load_iris()
        self.__X = pd.DataFrame(self.__iris.data,
                                columns=self.__iris.feature_names)
        self.__y = pd.DataFrame(self.__iris.target, columns=["Species"])
        self.__train = None
        self.__train_label = None
        self.__test = None
        self.__test_one_sample = None
        self.__test_label = None
        self.__mapper = None
        self.__estimator = None
        self.__pipeline = None

    def train_test_split_step(self):
        self.__train, self.__test, self.__train_label, self.__test_label = (
            train_test_split(self.__X, self.__y, test_size=0.2))
        self.__train = self.__train.reset_index(drop=True)
        self.__train_label = self.__train_label.reset_index(drop=True)
        self.__test = self.__test.reset_index(drop=True)
        self.__test_label = self.__test_label.reset_index(drop=True)

    def feature_engineering_step(self):
        self.__mapper = (DataFrameMapper([([
            "sepal length (cm)", "sepal width (cm)", "petal length (cm)",
            "petal width (cm)"
        ], [StandardScaler()])]))

    def model_train_step(self):
        self.__estimator = DecisionTreeClassifier()

    def pipeline_step(self):
        self.__pipeline = PMMLPipeline([("mapper", self.__mapper),
                                        ("estimator", self.__estimator)])
        self.__pipeline.fit(self.__train, self.__train_label)

    def output_step(self):
        joblib.dump(self.__pipeline,
                    "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z",
                    compress=3)

    def input_step(self):
        self.__pipeline = joblib.load(
            "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z")
        self.__test_one_sample = self.__test[0:1]
        print(self.__pipeline.predict(self.__test))
        # Pass in a single row
        print(self.__pipeline.predict(self.__test_one_sample))
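A short driver sketch showing the intended call order for FirstStep; only the methods defined above are used, and the hard-coded Windows paths in output_step/input_step are kept as-is:

step = FirstStep()
step.train_test_split_step()
step.feature_engineering_step()
step.model_train_step()
step.pipeline_step()
step.output_step()  # persist the fitted pipeline
step.input_step()   # reload it and predict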
Example #13
    def __init__(self):
        print('Loading corpus:')
        seed_list, content_list = self.get_data(
            './data/豆瓣')  # File format: <title>\t<rating label>\t<review text>

        print('\t' + 'Positive reviews: ' + str(len(seed_list)) +
              '  Negative reviews: ' + str(len(content_list)))

        seed_y = [0 for i in range(0, len(seed_list))]
        content_y = [1 for i in range(0, len(content_list))]

        queries = content_list + seed_list
        y = content_y + seed_y

        # Vectorize the data
        self.vectorizer = TfidfVectorizer(tokenizer=self.get_ngrams)
        X = self.vectorizer.fit_transform(queries)
        print('Dimensions after vectorization: ' + str(X.shape))

        print('Splitting into train and test sets...')
        # Use train_test_split to split the X and y lists
        # Rows of X_train correspond one-to-one with y_train  -->> used to train the model
        # Rows of X_test correspond one-to-one with y_test    -->> used to test the model's accuracy
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=46)
        print('Split complete; training the classifier on the training set...')

        #self.model = LogisticRegression()
        self.model = svm.SVC()
        #self.model=MultinomialNB(alpha=0.001)
        self.pipeline = PMMLPipeline([("classifier", self.model)])
        self.pipeline.fit(X_train, y_train)

        joblib.dump(self.pipeline, "./result/classifier.pkl.z",
                    compress=9)  # compress: compression level

        print('Training finished! Predicting on the test set...')
        predict = self.pipeline.predict(X_test)
        print("精度:{0:f}".format(
            metrics.precision_score(y_test, predict, average="weighted")))
        print("召回:{0:f}".format(
            metrics.recall_score(y_test, predict, average="weighted")))
        print("f1-score:{0:f}".format(
            metrics.f1_score(y_test, predict, average="weighted")))
        print("预测完毕!!!!")
        print('***********************************************************')
        print('***********************************************************')
Example #14
def build_auto(regressor, name):
	mapper = DataFrameMapper([
		(["cylinders"], CategoricalDomain()),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), Imputer(missing_values = "NaN"), StandardScaler()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["origin"], OneHotEncoder())
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
Example #15
class XgbModel(object):
    def __init__(self, train, train_label, test, test_label):
        self.__train = train
        self.__train_label = train_label
        self.__test = test
        self.__test_label = test_label
        self.__bst = None
        self.__feat_imp = None
        self.__test_preds = None
        self.__test_predictions = None
        self.__output = None

    def train(self):
        self.__bst = XGBClassifier(objective="binary:logistic")
        self.__bst = PMMLPipeline([("estimator", self.__bst)])
        self.__bst.fit(self.__train,
                       self.__train_label,
                       estimator__eval_metric="auc")

    def predict(self):
        self.__test_preds = self.__bst.predict_proba(self.__test)[:, 1]
        self.__test_predictions = self.__bst.predict(self.__test)

    def feature_importances(self):
        self.__feat_imp = (pd.Series(
            self.__bst.named_steps["estimator"].feature_importances_,
            ["gbc", "rf", "ab", "lr"]).sort_values(ascending=False))
        self.__feat_imp.plot(kind="bar", title="Feature Importances")
        plt.ylabel("Feature Importance Score")
        plt.show()

    def evaluate(self):
        print("auc : %.4f" %
              roc_auc_score(self.__test_label, self.__test_preds))
        print("accuracy score : %.4f" %
              accuracy_score(self.__test_label, self.__test_predictions))

    def evaluate_output(self):
        self.__output = np.hstack(
            (self.__test, self.__test_label.reshape(
                (-1, 1)), self.__test_preds.reshape((-1, 1))))
        pd.DataFrame(
            self.__output).to_csv("C:\\Users\\Dell\\Desktop\\output.csv")

    def xgbmodel_output(self):
        joblib.dump(self.__bst,
                    "C:\\Users\\Dell\\Desktop\\bstML.pkl.z",
                    compress=True)
Example #16
def gen_pmml_to_hdfs(sc, model, args):
    """ 生成 pmml,pkl 到 hdfs """

    # 先将pmml文件生成到driver的tmp目录下,然后再将其上传至HDFS
    # 目录名称 /tmp/时间戳/文件名
    print("===========> 保存模型文件到 HDFS")
    pmml_model_name = constants.PMML_NAME
    pkl_model_name = constants.PKL_NAME
    dir_name = "/tmp/" + str(time.time())
    args.tmp_dir = dir_name
    os.mkdir(dir_name)

    # Save as a pmml file
    pipeline = PMMLPipeline([("classifier", model)])
    sklearn2pmml(pipeline, dir_name + os.sep + pmml_model_name, with_repr=True)
    joblib.dump(model, dir_name + os.sep + pkl_model_name)

    # Upload the files to HDFS
    with open(dir_name + os.sep + pmml_model_name, "r") as f1, \
            open(dir_name + os.sep + pkl_model_name, "rb") as f2:
        data1 = f1.read()
        data2 = f2.read()
        save_data.write_data_to_cluster(sc, args.export_dir + os.sep + pmml_model_name, data1)
        save_data.write_data_to_cluster(sc, args.model_dir + os.sep + pkl_model_name, data2, is_text_file=False)

    # Remove the temporary files
    os.remove(dir_name + os.sep + pmml_model_name)
    os.remove(dir_name + os.sep + pkl_model_name)
Example #17
    def __model_to_pmml__(self):

        pipeline = PMMLPipeline([("regressor", self.regressor)])
        sklearn2pmml(pipeline, "pmml", with_repr=True)

        print('creating pmml')
        # Read in the file
        with open('pmml', 'r') as file:
            filedata = file.read()

        print('finding matches')
        # Replace x[1-...] with actual column names
        m = re.findall(r'x-?\d+', filedata)
        matches = []
        print('sorting matches')
        for match in m:
            if match in matches:
                continue
            matches.append(match)
        feature_cols = list(
            data_df.columns.difference(
                ["in_set", "smiles", "id", self.target_name]))
        matched_dict = dict(zip(matches, feature_cols))
        print('replacing')
        for match, feat in matched_dict.items():
            filedata = filedata.replace(match, feat)

        # Replace the standalone field name "y" with the target name;
        # a bare str.replace('y', ...) would also rewrite every "y" inside other words
        filedata = re.sub(r'\by\b', self.target_name, filedata)

        print('rewrite to file')
        # Write the file out again
        with open('pmml', 'w') as file:
            file.write(filedata)
Example #18
def get_training_data(con):
    data = pd.read_sql("""select 
                        user_responses.id as id, 
                        drink_name as drink,  
                        user_responses.question_name as question_name, 
                        question_choices.choice as choice,
                        session_id
                        from user_responses inner join question_choices
                        on user_responses.question_choice = question_choices.id"""
                       , con=con
                       , index_col='id')
    
    print(data)
    
    data = data.pivot(index='session_id', columns='question_name', values=['choice', 'drink'])
    
    print(data)
    
    pipeline = PMMLPipeline([
            ("transformation", DataFrameMapper([
                (["hotdog"], [CategoricalDomain(), LabelBinarizer()])
                , (["tp"], [CategoricalDomain(), LabelBinarizer()])
                , (["personality"], [CategoricalDomain(), LabelBinarizer()])
            ])),
            ("classifier", GaussianNB())
        ])
    return data, pipeline
Example #19
def pickle_model(classifier, feature_names, model_path, label_text='label'):
    p, extension = os.path.splitext(model_path)
    classifier.feature_names = feature_names

    if extension == '.pmml':
        joblib.dump(classifier, p + '.pkl', compress=4)

        pipeline = PMMLPipeline([
            ('classifier', classifier)
        ])
        pipeline.target_field = label_text
        pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pipeline, model_path)

    else:
        joblib.dump(classifier, model_path, compress=4)
Example #20
def build_wheat(kmeans, name, with_affinity=True):
    mapper = DataFrameMapper([(wheat_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper), ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity:
        Xt = pipeline_transform(pipeline, wheat_X)
        affinity_0 = kmeans_distance(kmeans, 0, Xt)
        affinity_1 = kmeans_distance(kmeans, 1, Xt)
        affinity_2 = kmeans_distance(kmeans, 2, Xt)
        cluster_affinity = DataFrame(
            numpy.transpose([affinity_0, affinity_1, affinity_2]),
            columns=["affinity(0)", "affinity(1)", "affinity(2)"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
Example #21
    def model_wrapper_fit(self):
        self.__model_list.extend([
            self.__gradient_boosting_classifier,
            self.__random_forest_classifier, self.__logistic_regression,
            self.__k_neighbors_classifier, self.__extra_tree_classifier,
            self.__xgb_classifier
        ])

        for model in self.__model_list:
            temp = PMMLPipeline([("estimator", model)])
            temp.fit(self.__train, self.__train_label)
            self.__pmml_model_list.append(temp)

        print(
            self.__logistic_regression.fit(self.__train,
                                           self.__train_label).coef_)
        print(
            self.__logistic_regression.fit(self.__train,
                                           self.__train_label).intercept_)
Example #22
	def test_fit_verify(self):
		pipeline = PMMLPipeline([("estimator", DummyRegressor())])
		self.assertFalse(hasattr(pipeline, "active_fields"))
		self.assertFalse(hasattr(pipeline, "target_fields"))
		X = DataFrame([[1, 0], [2, 0], [3, 0]], columns = ["X1", "X2"])
		y = Series([0.5, 1.0, 1.5], name = "y")
		pipeline.fit(X, y)
		self.assertEqual(["X1", "X2"], pipeline.active_fields.tolist())
		self.assertEqual("y", pipeline.target_fields.tolist())
		X.columns = ["x1", "x2"]
		pipeline.fit(X, y)
		self.assertEqual(["x1", "x2"], pipeline.active_fields.tolist())
		self.assertEqual("y", pipeline.target_fields.tolist())
		self.assertFalse(hasattr(pipeline, "verification"))
		pipeline.verify(X.sample(2))
		self.assertEqual(2, len(pipeline.verification.active_values))
		self.assertEqual(2, len(pipeline.verification.target_values))
		X.columns = ["x2", "x1"]
		with self.assertRaises(ValueError):
			pipeline.verify(X.sample(2))
Example #23
def build_audit(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                            threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                                threshold="median"))
        ]), ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()), ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()), ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #24
def build_sentiment(classifier, name, with_proba=True):
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectorProxy(SelectPercentile(chi2, percentile=10))),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
Example #25
def build_audit(classifier, name, with_proba = True, **kwargs):
	continuous_mapper = DataFrameMapper([
		("Age", ContinuousDomain()),
		("Income", ContinuousDomain()),
		("Hours", ContinuousDomain())
	])
	categorical_mapper = DataFrameMapper([
		("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
		("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
		("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
		("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		("Deductions", [CategoricalDomain(), LabelEncoder()]),
	])
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	customize(classifier, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")
Example #26
def build_iris(classifier, name, with_proba=True):
    pipeline = PMMLPipeline([
        ("union",
         FeatureUnion([("normal_scale",
                        DataFrameMapper([
                            (iris_X.columns.values, ContinuousDomain()),
                        ])),
                       ("log_scale",
                        DataFrameMapper([(iris_X.columns.values,
                                          FunctionTransformer(numpy.log10))]))
                       ])), ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(iris_X),
                                  columns=[
                                      "probability(setosa)",
                                      "probability(versicolor)",
                                      "probability(virginica)"
                                  ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #27
def save_model_to_local_file(booster, model_params, meta, filename):
    from sklearn2pmml import PMMLPipeline, sklearn2pmml
    try:
        from xgboost.compat import XGBoostLabelEncoder
    except:  # noqa: E722
        # xgboost==0.82.0 does not have XGBoostLabelEncoder
        # in xgboost.compat.py
        from xgboost.sklearn import XGBLabelEncoder as XGBoostLabelEncoder

    objective = model_params.get("objective")
    bst_meta = dict()

    if objective.startswith("binary:") or objective.startswith("multi:"):
        if objective.startswith("binary:"):
            num_class = 2
        else:
            num_class = model_params.get("num_class")
            assert num_class is not None and num_class > 0, \
                "num_class should not be None"

        # To fake a trained XGBClassifier, there must be "_le", "classes_",
        # inside XGBClassifier. See here:
        # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
        model = xgb.XGBClassifier()
        label_encoder = XGBoostLabelEncoder()
        label_encoder.fit(list(range(num_class)))
        model._le = label_encoder
        model.classes_ = model._le.classes_

        bst_meta["_le"] = {"classes_": model.classes_.tolist()}
        bst_meta["classes_"] = model.classes_.tolist()
    elif objective.startswith("reg:"):
        model = xgb.XGBRegressor()
    elif objective.startswith("rank:"):
        model = xgb.XGBRanker()
    else:
        raise ValueError(
            "Not supported objective {} for saving PMML".format(objective))

    model_type = type(model).__name__
    bst_meta["type"] = model_type

    # Meta data is needed for saving sklearn pipeline. See here:
    # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
    booster.set_attr(scikit_learn=json.dumps(bst_meta))
    booster.save_model(filename)
    save_model_metadata("model_meta.json", meta)
    booster.set_attr(scikit_learn=None)
    model.load_model(filename)

    pipeline = PMMLPipeline([(model_type, model)])
    sklearn2pmml(pipeline, "{}.pmml".format(filename))
Example #28
def get_model(PARAMS):
    """
    Get model according to given parameters.

    :param PARAMS:
    :return:
    """
    estimator = GradientBoostingClassifier()
    for k in PARAMS:
        if hasattr(estimator, k):
            setattr(estimator, k, PARAMS.get(k))
    pipeline = PMMLPipeline([('estimator', estimator)])
    return pipeline
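A brief usage sketch; PARAMS keys are matched against GradientBoostingClassifier attributes, so standard hyperparameter names apply (values below are illustrative):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
pipeline = get_model({"n_estimators": 50, "max_depth": 3, "learning_rate": 0.1})
pipeline.fit(X, y)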
Example #29
def get_sample_data(con):
    data = pd.read_sql("select * from sample_training_data"
                       , con=con
                       , index_col="id")
    
    pipeline = PMMLPipeline([
            ("transformation", DataFrameMapper([
                (["hotdog"], [CategoricalDomain(), LabelBinarizer()]),
                (["tp"], [CategoricalDomain(), LabelBinarizer()])
            ])),
            ("classifier", GaussianNB())
        ])
    return data, pipeline
Example #30
def build_wheat(kmeans, name, with_affinity=True):
    mapper = DataFrameMapper([([
        "Area", "Perimeter", "Compactness", "Kernel.Length", "Kernel.Width",
        "Asymmetry", "Groove.Length"
    ], ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("transformer", FunctionTransformer(numpy.log10)),
                             ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity:
        Xt = pipeline_transform(pipeline, wheat_X)
        affinity_0 = kmeans_distance(kmeans, 0, Xt)
        affinity_1 = kmeans_distance(kmeans, 1, Xt)
        affinity_2 = kmeans_distance(kmeans, 2, Xt)
        cluster_affinity = DataFrame(
            numpy.transpose([affinity_0, affinity_1, affinity_2]),
            columns=["affinity_0", "affinity_1", "affinity_2"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def get_model(PARAMS):
    """
    Get model according to given parameters.

    :param PARAMS:
    :return:
    """
    estimator = LogisticRegression()
    for k in PARAMS:
        if hasattr(estimator, k):
            setattr(estimator, k, PARAMS.get(k))
    pipeline = PMMLPipeline([('estimator', estimator)])
    return pipeline
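The same usage pattern as the GradientBoostingClassifier variant above applies here, with LogisticRegression hyperparameters in PARAMS (illustrative values):

pipeline = get_model({"C": 0.5, "max_iter": 200})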