Example #1
0
    def test_xgboost_02(self):
        """Train an XGBRegressor on auto-mpg, export to PMML, and verify that
        every split value and leaf score in the PMML matches the booster dump.
        """
        auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
        feature_names = [
            name for name in auto.columns if name not in ('mpg', 'car name')
        ]
        target_name = 'mpg'
        f_name = "xgbr_pmml.pmml"
        model = XGBRegressor()
        pipeline_obj = Pipeline([('xgbr', model)])

        pipeline_obj.fit(auto[feature_names], auto[target_name])
        xgboost_to_pmml(pipeline_obj,
                        feature_names,
                        target_name,
                        f_name,
                        description="A test model")
        pmml_obj = pml.parse(f_name, True)

        pmml_value_list = []
        pmml_score_list = []

        # Walk every tree segment in the PMML: each node contributes its split
        # value; leaves (no child nodes) also contribute their score.
        seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
        for seg in seg_tab:
            for node in seg.TreeModel.Node.Node:
                pmml_value_list.append(node.SimplePredicate.value)
                if len(node.get_Node()) > 0:
                    self.extractValues(node, pmml_value_list, pmml_score_list)
                else:
                    pmml_score_list.append(node.score)

        # Collect the same data straight from the fitted booster.  Dump the
        # booster once -- the original re-dumped it for every tree (O(n^2)).
        model_value_list = []
        model_score_list = []
        booster_dump = model._Booster.get_dump(dump_format='json')
        for i in range(model.n_estimators):
            node_list = json.loads(booster_dump[i])
            list_score_temp = []
            list_val_temp = []
            self.create_node(node_list, list_score_temp, list_val_temp)
            model_score_list += list_score_temp
            model_value_list += list_val_temp

        ##1 leaf scores must match
        for model_val, pmml_val in zip(model_score_list, pmml_score_list):
            self.assertEqual(model_val, float(pmml_val))

        ##2 split values must match
        for model_val, pmml_val in zip(model_value_list, pmml_value_list):
            self.assertEqual(model_val, pmml_val)

        ##3 the PMML file was written
        self.assertEqual(os.path.isfile(f_name), True)
    def test_xgboost_04(self):
        """Pipeline with DataFrameMapper preprocessing exports to PMML."""
        auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
        X = auto.drop(['mpg'], axis=1)
        y = auto['mpg']

        # BUG FIX: ('mpg') is just the string 'mpg', so `name not in ('mpg')`
        # was a substring test (a column named 'm' or 'pg' would also be
        # dropped).  A one-element tuple makes it an exact-match exclusion.
        feature_names = [name for name in auto.columns if name not in ('mpg',)]

        target_name = 'mpg'
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=101)
        # NOTE(review): the step is labelled 'lgbmr' but holds an XGBRegressor;
        # left unchanged since the step name may be reflected in the PMML.
        pipeline_obj = Pipeline([
            ('mapper',
             DataFrameMapper([('car name', CountVectorizer()),
                              (['displacement'], [StandardScaler()])])),
            ('lgbmr', XGBRegressor())
        ])
        pipeline_obj.fit(x_train, y_train)

        xgboost_to_pmml(pipeline_obj, feature_names, target_name,
                        "xgbr_pmml_preprocess2.pmml")

        self.assertEqual(os.path.isfile("xgbr_pmml_preprocess2.pmml"), True)
Example #3
0
def train(data_conf, model_conf, **kwargs):
    """Train a diabetes XGB classifier from a Teradata table and export it
    as joblib + PMML artefacts, recording feature-importance statistics.

    Args:
        data_conf: dataset config; uses "table" and optional "schema".
        model_conf: model config; uses "hyperParameters" (eta, max_depth).
        **kwargs: unused; accepted for framework compatibility.
    """
    hyperparams = model_conf["hyperParameters"]

    # Connect to Teradata; fall back to the user's default database when no
    # schema is configured.
    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_pdf = train_df.to_pandas()

    # split data into X and y.  FIX: use the keyword form -- the positional
    # `axis` argument to DataFrame.drop was deprecated and removed in pandas 2.0.
    X_train = train_pdf.drop(columns=[target_name])
    y_train = train_pdf[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")

    from xgboost import plot_importance
    model["xgb"].get_booster().feature_names = feature_names
    plot_importance(model["xgb"].get_booster(), max_num_features=10)
    save_plot("feature_importance.png")

    # Record training stats against the Teradata frame (weight = number of
    # times each feature is used to split).
    feature_importance = model["xgb"].get_booster().get_score(importance_type="weight")
    stats.record_stats(train_df,
                       features=feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
 def test_02_xgb_regressor(self):
     """Score a plain XGB regressor in zserver and compare with local output."""
     print("\ntest 02 (xgb regressor without preprocessing)\n")
     regressor = XGBRegressor()
     pipe = Pipeline([
         ("model", regressor)
     ])
     pipe.fit(self.X, self.Y)
     file_name = "test02xgboost.pmml"
     xgboost_to_pmml(pipe, self.features, 'Species', file_name)
     # Deploy the exported PMML and score the test file remotely.
     deployed_name = self.adapa_utility.upload_to_zserver(file_name)
     zserver_pred, _ = self.adapa_utility.score_in_zserver(deployed_name, self.test_file)
     # Remote predictions must agree with the in-memory pipeline.
     local_pred = pipe.predict(self.X)
     self.assertEqual(self.adapa_utility.compare_predictions(zserver_pred, local_pred), True)
Example #5
0
def train(data_conf, model_conf, **kwargs):
    """Train a diabetes XGB classifier from a CSV read via Spark and export it
    as joblib + PMML artefacts.

    Args:
        data_conf: dataset config; uses "url".
        model_conf: model config; uses "hyperParameters" (eta, max_depth).
        **kwargs: unused; accepted for framework compatibility.
    """
    hyperparams = model_conf["hyperParameters"]

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    # in a real world scenario, you would read from S3, HDFS, Teradata,
    # etc but for demo reading from url. we could read via pandas.read_csv but just to show pyspark ...
    urllib.request.urlretrieve(data_conf["url"], "/tmp/data.csv")
    all_columns = feature_names + [target_name]
    train_df = spark.read.format("csv")\
        .option("inferSchema", "true")\
        .load("/tmp/data.csv")\
        .toDF(*all_columns)

    # do feature eng in spark / joins whatever reason you're using pyspark...
    # split into test and train (70/30, fixed seed 42 for reproducibility)
    train_df = train_df.randomSplit([0.7, 0.3], 42)[0].toPandas()

    # split data into X and y.  FIX: keyword form -- the positional `axis`
    # argument to DataFrame.drop was deprecated and removed in pandas 2.0.
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb',
                       XGBClassifier(eta=hyperparams["eta"],
                                     max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access
    model.feature_names = feature_names

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    # feature_names already holds exactly the model's 8 columns, so the
    # original [0:8] slice was a no-op.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
    def test_xgboost_05(self):
        """Binary XGBClassifier pipeline exports to PMML successfully."""
        iris = datasets.load_iris()
        iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        # Alternate 0/1 labels to build a binary target.
        iris_df['target'] = [i % 2 for i in range(iris.data.shape[0])]

        target = 'target'
        features = iris_df.columns.drop(target)

        clf_pipeline = Pipeline([('lgbmc', XGBClassifier())])
        clf_pipeline.fit(iris_df[features], iris_df[target])

        xgboost_to_pmml(clf_pipeline, features, target, "xgbc_bin_pmml.pmml")
        self.assertEqual(os.path.isfile("xgbc_bin_pmml.pmml"), True)
    def test_xgboost_01(self):
        """Multiclass XGBClassifier pipeline exports to PMML successfully."""
        iris = datasets.load_iris()
        iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        iris_df['Species'] = iris.target

        target = 'Species'
        features = iris_df.columns.drop(target)

        clf_pipeline = Pipeline([('lgbmc', XGBClassifier())])
        clf_pipeline.fit(iris_df[features], iris_df[target])

        xgboost_to_pmml(clf_pipeline, features, target, "xgbc_pmml.pmml")
        self.assertEqual(os.path.isfile("xgbc_pmml.pmml"), True)
 def test_03_xgb_classifier(self):
     """Binary XGB classifier with scaling: compare zserver vs local output."""
     print("\ntest 03 (xgb classifier with preprocessing) [binary-class]\n")
     clf = XGBClassifier()
     pipe = Pipeline([
         ('scaler', MinMaxScaler()),
         ("model", clf)
     ])
     pipe.fit(self.X, self.Y_bin)
     file_name = "test03xgboost.pmml"
     xgboost_to_pmml(pipe, self.features, 'Species', file_name)
     # Deploy the PMML and score remotely, then compare both the hard
     # predictions and the class probabilities against the local pipeline.
     deployed_name = self.adapa_utility.upload_to_zserver(file_name)
     zserver_pred, zserver_prob = self.adapa_utility.score_in_zserver(deployed_name, self.test_file)
     local_pred = pipe.predict(self.X)
     local_prob = pipe.predict_proba(self.X)
     self.assertEqual(self.adapa_utility.compare_predictions(zserver_pred, local_pred), True)
     self.assertEqual(self.adapa_utility.compare_probability(zserver_prob, local_prob), True)
def train(data_conf, model_conf, **kwargs):
    """Train a diabetes XGB classifier from a Teradata table and export it
    as joblib + PMML artefacts.

    Args:
        data_conf: dataset config; uses "table".
        model_conf: model config; uses "hyperParameters" (eta, max_depth).
        **kwargs: unused; accepted for framework compatibility.
    """
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_df = train_df.to_pandas()

    # split data into X and y.  FIX: keyword form -- the positional `axis`
    # argument to DataFrame.drop was deprecated and removed in pandas 2.0.
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb',
                       XGBClassifier(eta=hyperparams["eta"],
                                     max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
    def test_xgboost_02(self):
        """XGBRegressor pipeline on auto-mpg exports to PMML successfully."""
        auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
        X = auto.drop(['mpg','car name'], axis=1)
        y = auto['mpg']

        target_name = 'mpg'
        excluded = ('mpg','car name')
        feature_names = [col for col in auto.columns if col not in excluded]

        reg_pipeline = Pipeline([
            ('lgbmr', XGBRegressor())
        ])
        reg_pipeline.fit(auto[feature_names], auto[target_name])

        xgboost_to_pmml(reg_pipeline, feature_names, target_name, "xgbr_pmml.pmml")
        self.assertEqual(os.path.isfile("xgbr_pmml.pmml"), True)
Example #11
0
    def test_xgboost_06(self):
        """Passing a bare model (not a Pipeline) to the exporter raises TypeError."""
        iris = datasets.load_iris()
        iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        iris_df['Species'] = iris.target

        target = 'Species'
        features = iris_df.columns.drop(target)
        f_name = "xgbc_pmml.pmml"

        clf = XGBClassifier()
        clf.fit(iris_df[features], iris_df[target])

        # xgboost_to_pmml expects a sklearn Pipeline, not a raw estimator.
        with self.assertRaises(TypeError):
            xgboost_to_pmml(clf, features, target, f_name,
                            model_name="testModel")
    def test_xgboost_03(self):
        """XGBClassifier with StandardScaler preprocessing exports to PMML."""
        iris = datasets.load_iris()
        iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        iris_df['Species'] = iris.target

        target = 'Species'
        features = iris_df.columns.drop(target)

        clf_pipeline = Pipeline([('scaling', StandardScaler()),
                                 ('LGBMC_preprocess',
                                  XGBClassifier(n_estimators=5))])
        clf_pipeline.fit(iris_df[features], iris_df[target])

        xgboost_to_pmml(clf_pipeline, features, target,
                        "xgbc_pmml_preprocess.pmml")
        self.assertEqual(os.path.isfile("xgbc_pmml_preprocess.pmml"), True)
Example #13
0
def train(data_conf, model_conf, **kwargs):
    """Train a diabetes XGB classifier from a Spark-read dataset and export it
    as joblib + PMML artefacts.

    Args:
        data_conf: dataset config; uses "url".
        model_conf: model config; uses "hyperParameters" (eta, max_depth).
        **kwargs: unused; accepted for framework compatibility.
    """
    hyperparams = model_conf["hyperParameters"]

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    train_df = read_dataframe(spark, data_conf["url"])

    # do feature eng in spark / joins whatever reason you're using pyspark...
    # split into test and train (70/30, fixed seed 42 for reproducibility)
    train_df = train_df.randomSplit([0.7, 0.3], 42)[0].toPandas()

    # split data into X and y.  FIX: keyword form -- the positional `axis`
    # argument to DataFrame.drop was deprecated and removed in pandas 2.0.
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb',
                       XGBClassifier(eta=hyperparams["eta"],
                                     max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access
    model.feature_names = feature_names

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    # feature_names already holds exactly the model's 8 columns, so the
    # original [0:8] slice was a no-op.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
Example #14
0
                                                    random_state=seed)

# Build a scaled 5-tree XGBoost classification pipeline and fit it on the
# training split produced above (X_train/y_train come from earlier in the
# script, outside this excerpt).
pipeline = Pipeline([('scaling', StandardScaler()),
                     ('xgb', XGBClassifier(n_estimators=5, seed=seed))])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)

import pickle
import numpy as np

# Round-trip the fitted pipeline through pickle and check that the restored
# model reproduces both the hard predictions and the probabilities exactly.
d = pickle.dumps(pipeline)
saved_pipeline = pickle.loads(d)
y_pred_saved = saved_pipeline.predict(X_test)
y_pred_proba_saved = saved_pipeline.predict_proba(X_test)

assert np.array_equal(y_pred, y_pred_saved), "Not equal after saved"
assert np.array_equal(y_pred_proba_saved,
                      y_pred_proba), "Not equal after saved"

# Export the restored pipeline to PMML.
from nyoka import xgboost_to_pmml
xgboost_to_pmml(saved_pipeline, features, target, "xgb-iris.pmml")

# Score the PMML with pypmml and confirm it agrees with the in-memory model.
# NOTE(review): the "predicted_Species" key assumes the target column is
# named 'Species' -- confirm where `target` is defined upstream.
from pypmml import Model
model = Model.fromFile("xgb-iris.pmml")
y_pred_pmml = model.predict(X_test)

assert np.array_equal(
    y_pred, y_pred_pmml["predicted_Species"]), "Not equal after saved"
Example #15
0
    def test_xgboost_03(self):
        """Multiclass XGBClassifier with scaling: exported PMML trees must
        reproduce the booster's split values and leaf scores, class by class.
        """
        iris = datasets.load_iris()
        irisd = pd.DataFrame(iris.data, columns=iris.feature_names)
        irisd['Species'] = iris.target

        features = irisd.columns.drop('Species')
        target = 'Species'
        f_name = "xgbc_pmml_preprocess.pmml"
        model = XGBClassifier(n_estimators=5)
        pipeline_obj = Pipeline([('scaling', StandardScaler()),
                                 ('xgbc', model)])

        pipeline_obj.fit(irisd[features], irisd[target])
        xgboost_to_pmml(pipeline_obj, features, target, f_name)
        pmml_obj = pml.parse(f_name, True)

        # One dumped tree per (estimator, class).  Dump the booster once --
        # the original re-dumped the whole booster for every tree (O(n^2)).
        booster_dump = model._Booster.get_dump(dump_format='json')
        n_trees = model.n_estimators * model.n_classes_
        get_nodes_in_json_format = [
            json.loads(booster_dump[i]) for i in range(n_trees)
        ]

        # XGBoost emits trees round-robin over the classes, so tree i belongs
        # to class i % n_classes_.  Group scores/values per class segment
        # (replaces the original's three duplicated n=1/2/3 branches).
        seg_scores = [[] for _ in range(model.n_classes_)]
        seg_values = [[] for _ in range(model.n_classes_)]
        for i, node_list in enumerate(get_nodes_in_json_format):
            list_score_temp = []
            list_val_temp = []
            self.create_node(node_list, list_score_temp, list_val_temp)
            seg_scores[i % model.n_classes_] += list_score_temp
            seg_values[i % model.n_classes_] += list_val_temp

        model_score_list = [s for seg in seg_scores for s in seg]
        model_value_list = [v for seg in seg_values for v in seg]

        pmml_value_list = []
        pmml_score_list = []

        # Walk the per-class segments (ids 1..3 for the three iris classes);
        # each node contributes its split value, leaves also their score.
        seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
        for seg in seg_tab:
            if int(seg.id) <= 3:
                for segment in seg.MiningModel.Segmentation.Segment:
                    node_tab = segment.TreeModel.Node.Node
                    if not node_tab:
                        # Degenerate tree: a single leaf node.
                        pmml_score_list.append(segment.TreeModel.Node.score)
                    else:
                        for node in node_tab:
                            pmml_value_list.append(node.SimplePredicate.value)
                            if len(node.get_Node()) > 0:
                                self.extractValues(node, pmml_value_list,
                                                   pmml_score_list)
                            else:
                                pmml_score_list.append(node.score)

        ##1 leaf scores must match
        for model_val, pmml_val in zip(model_score_list, pmml_score_list):
            self.assertEqual(model_val, float(pmml_val))

        ##2 split values must match
        for model_val, pmml_val in zip(model_value_list, pmml_value_list):
            self.assertEqual(model_val, pmml_val)

        ##3 the PMML file was written
        self.assertEqual(os.path.isfile(f_name), True)
Example #16
0
    def test_xgboost_04(self):
        """XGBRegressor behind a DataFrameMapper: exported PMML trees must
        reproduce the booster's split values and leaf scores.
        """
        auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
        X = auto.drop(['mpg'], axis=1)
        y = auto['mpg']

        # BUG FIX: `name not in 'mpg'` was a substring test against the string
        # 'mpg' (a column named 'm' or 'pg' would also be dropped).  Use a
        # tuple so only the target column itself is excluded.
        feature_names = [name for name in auto.columns if name not in ('mpg',)]
        f_name = "xgbr_pmml_preprocess2.pmml"
        target_name = 'mpg'
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=101)
        model = XGBRegressor()
        pipeline_obj = Pipeline([
            ('mapper',
             DataFrameMapper([('car name', CountVectorizer()),
                              (['displacement'], [StandardScaler()])])),
            ('xgbr', model)
        ])

        pipeline_obj.fit(x_train, y_train)
        xgboost_to_pmml(pipeline_obj, feature_names, target_name, f_name)
        pmml_obj = pml.parse(f_name, True)

        pmml_value_list = []
        pmml_score_list = []

        # Each node contributes its split value; leaves also their score.
        seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
        for seg in seg_tab:
            for node in seg.TreeModel.Node.Node:
                pmml_value_list.append(node.SimplePredicate.value)
                if len(node.get_Node()) > 0:
                    self.extractValues(node, pmml_value_list, pmml_score_list)
                else:
                    pmml_score_list.append(node.score)

        # Same data straight from the booster; dump once, not once per tree.
        model_value_list = []
        model_score_list = []
        booster_dump = model._Booster.get_dump(dump_format='json')
        for i in range(model.n_estimators):
            node_list = json.loads(booster_dump[i])
            list_score_temp = []
            list_val_temp = []
            self.create_node(node_list, list_score_temp, list_val_temp)
            model_score_list += list_score_temp
            model_value_list += list_val_temp

        ##1 leaf scores must match
        for model_val, pmml_val in zip(model_score_list, pmml_score_list):
            self.assertEqual(model_val, float(pmml_val))

        ##2 split values must match
        for model_val, pmml_val in zip(model_value_list, pmml_value_list):
            self.assertEqual(model_val, pmml_val)

        ##3 the PMML file was written
        self.assertEqual(os.path.isfile(f_name), True)
Example #17
0
    def test_xgboost_05(self):
        """Binary XGBClassifier: PMML segment 1 trees must match the booster's
        split values and leaf scores.
        """
        iris = datasets.load_iris()
        irisd = pd.DataFrame(iris.data, columns=iris.feature_names)
        # Alternate 0/1 labels to build a binary target.
        irisd['target'] = [i % 2 for i in range(iris.data.shape[0])]

        features = irisd.columns.drop('target')
        target = 'target'
        f_name = "xgbc_bin_pmml.pmml"
        model = XGBClassifier(min_child_weight=6,
                              n_estimators=10,
                              scale_pos_weight=10,
                              deterministic_histogram=False)
        pipeline_obj = Pipeline([('xgbc', model)])

        pipeline_obj.fit(irisd[features], irisd[target])
        xgboost_to_pmml(pipeline_obj, features, target, f_name)
        pmml_obj = pml.parse(f_name, True)

        pmml_value_list = []
        pmml_score_list = []

        # Binary model: only segment id 1 carries the tree ensemble.  Each
        # node contributes its split value; leaves also their score.
        seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
        for seg in seg_tab:
            if int(seg.id) == 1:
                for segment in seg.MiningModel.Segmentation.Segment:
                    node_tab = segment.TreeModel.Node.Node
                    if not node_tab:
                        # Degenerate tree: a single leaf node.
                        pmml_score_list.append(segment.TreeModel.Node.score)
                    else:
                        for node in node_tab:
                            pmml_value_list.append(node.SimplePredicate.value)
                            if len(node.get_Node()) > 0:
                                self.extractValues(node, pmml_value_list,
                                                   pmml_score_list)
                            else:
                                pmml_score_list.append(node.score)

        # Same data straight from the booster; dump once, not once per tree.
        model_value_list = []
        model_score_list = []
        booster_dump = model._Booster.get_dump(dump_format='json')
        for i in range(model.n_estimators):
            node_list = json.loads(booster_dump[i])
            list_score_temp = []
            list_val_temp = []
            self.create_node(node_list, list_score_temp, list_val_temp)
            model_score_list += list_score_temp
            model_value_list += list_val_temp

        ##1 leaf scores must match
        for model_val, pmml_val in zip(model_score_list, pmml_score_list):
            self.assertEqual(model_val, float(pmml_val))

        ##2 split values must match
        for model_val, pmml_val in zip(model_value_list, pmml_value_list):
            self.assertEqual(model_val, pmml_val)

        ##3 the PMML file was written
        self.assertEqual(os.path.isfile(f_name), True)