def test_xgboost_02(self):
    """Export an XGBRegressor pipeline to PMML and verify that every tree
    node's split value and leaf score in the PMML matches the booster's own
    JSON dump.

    Relies on helper methods defined elsewhere in this test class:
    - self.extractValues: recursively collects values/scores from a PMML node
    - self.create_node: recursively collects values/scores from a JSON tree
    """
    auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
    # Every column except the target and the free-text car name is a feature.
    feature_names = [ name for name in auto.columns if name not in ('mpg', 'car name') ]
    target_name = 'mpg'
    f_name = "xgbr_pmml.pmml"
    model = XGBRegressor()
    pipeline_obj = Pipeline([('xgbr', model)])
    pipeline_obj.fit(auto[feature_names], auto[target_name])
    xgboost_to_pmml(pipeline_obj, feature_names, target_name, f_name, description="A test model")
    # Re-parse the generated PMML (second arg silences parser output —
    # presumably; verify against the pml.parse signature).
    pmml_obj = pml.parse(f_name, True)
    pmml_value_list = []   # split-predicate values found in the PMML
    model_value_list = []  # split values found in the booster JSON dump
    pmml_score_list = []   # leaf scores found in the PMML
    model_score_list = []  # leaf scores found in the booster JSON dump
    # One Segment per boosted tree in the mining model.
    seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
    for seg in seg_tab:
        for node in seg.TreeModel.Node.Node:
            varlen = node.get_Node().__len__()
            if varlen > 0:
                # Internal node: record its predicate value, then recurse.
                pmml_value_list.append(node.SimplePredicate.value)
                self.extractValues(node, pmml_value_list, pmml_score_list)
            else:
                # Leaf: record both the predicate value and the score.
                pmml_value_list.append(node.SimplePredicate.value)
                pmml_score_list.append(node.score)
    # Collect the same information straight from the fitted booster,
    # one JSON tree dump per estimator.
    get_nodes_in_json_format = []
    for i in range(model.n_estimators):
        get_nodes_in_json_format.append(
            json.loads(model._Booster.get_dump(dump_format='json')[i]))
    for i in range(len(get_nodes_in_json_format)):
        list_score_temp = []
        list_val_temp = []
        node_list = get_nodes_in_json_format[i]
        self.create_node(node_list, list_score_temp, list_val_temp)
        # Concatenation builds new lists, so clearing the temporaries
        # afterwards does not affect the accumulated results.
        model_score_list = model_score_list + list_score_temp
        model_value_list = model_value_list + list_val_temp
        list_val_temp.clear()
        list_score_temp.clear()
    ##1 leaf scores agree (PMML stores them as strings)
    for model_val, pmml_val in zip(model_score_list, pmml_score_list):
        self.assertEqual(model_val, float(pmml_val))
    ##2 split values agree
    for model_val, pmml_val in zip(model_value_list, pmml_value_list):
        self.assertEqual(model_val, pmml_val)
    ##3 the PMML file was actually written to disk
    self.assertEqual(os.path.isfile(f_name), True)
def test_xgboost_04(self):
    """Export an XGBRegressor pipeline that includes DataFrameMapper
    preprocessing (CountVectorizer on 'car name', StandardScaler on
    'displacement') and assert the PMML file is written.
    """
    auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
    X = auto.drop(['mpg'], axis=1)
    y = auto['mpg']
    # BUG FIX: ('mpg') is just the string 'mpg', so `name not in ('mpg')`
    # performed a substring test and would also drop any column whose name
    # is a substring of "mpg". A one-element tuple gives the intended
    # exact-name exclusion.
    feature_names = [name for name in auto.columns if name not in ('mpg',)]
    target_name = 'mpg'
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=101)
    pipeline_obj = Pipeline([
        ('mapper', DataFrameMapper([('car name', CountVectorizer()),
                                    (['displacement'], [StandardScaler()])])),
        ('lgbmr', XGBRegressor())
    ])
    pipeline_obj.fit(x_train, y_train)
    xgboost_to_pmml(pipeline_obj, feature_names, target_name,
                    "xgbr_pmml_preprocess2.pmml")
    self.assertEqual(os.path.isfile("xgbr_pmml_preprocess2.pmml"), True)
def train(data_conf, model_conf, **kwargs):
    """Train a MinMaxScaler + XGBClassifier diabetes pipeline from a
    Teradata table, then export joblib and PMML artefacts, a feature
    importance plot, and recorded training statistics.

    Parameters
    ----------
    data_conf : dict
        Expects "table" and optionally "schema" plus connection env vars.
    model_conf : dict
        Expects "hyperParameters" with "eta" and "max_depth".
    """
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   # Fall back to the user's default database when no
                   # schema is configured.
                   database=data_conf["schema"]
                   if "schema" in data_conf and data_conf["schema"] != ""
                   else None)

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick",
                     "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_pdf = train_df.to_pandas()

    # split data into X and y
    # FIX: the positional axis argument to DataFrame.drop was deprecated in
    # pandas 1.0 and removed in 2.0 — use the explicit columns= form.
    X_train = train_pdf.drop(columns=[target_name])
    y_train = train_pdf[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model, col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")

    from xgboost import plot_importance
    model["xgb"].get_booster().feature_names = feature_names
    plot_importance(model["xgb"].get_booster(), max_num_features=10)
    save_plot("feature_importance.png")

    feature_importance = model["xgb"].get_booster().get_score(importance_type="weight")
    stats.record_stats(train_df,
                       features=feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
def test_02_xgb_regressor(self):
    """Score an unpreprocessed XGBRegressor PMML in zserver and check the
    remote predictions agree with the local pipeline's."""
    print("\ntest 02 (xgb regressor without preprocessing)\n")
    regressor = XGBRegressor()
    pipe = Pipeline([("model", regressor)])
    pipe.fit(self.X, self.Y)
    pmml_file = "test02xgboost.pmml"
    xgboost_to_pmml(pipe, self.features, 'Species', pmml_file)
    deployed_name = self.adapa_utility.upload_to_zserver(pmml_file)
    remote_preds, _ = self.adapa_utility.score_in_zserver(deployed_name, self.test_file)
    local_preds = pipe.predict(self.X)
    self.assertEqual(
        self.adapa_utility.compare_predictions(remote_preds, local_preds), True)
def train(data_conf, model_conf, **kwargs):
    """Train a MinMaxScaler + XGBClassifier diabetes pipeline from a CSV
    downloaded to /tmp and loaded via pyspark, then export joblib and PMML
    artefacts.

    Parameters
    ----------
    data_conf : dict
        Expects "url" pointing at the CSV dataset.
    model_conf : dict
        Expects "hyperParameters" with "eta" and "max_depth".
    """
    hyperparams = model_conf["hyperParameters"]

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick",
                     "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    # in a real world scenario, you would read from S3, HDFS, Teradata,
    # etc but for demo reading from url. we could read via pandas.read_csv
    # but just to show pyspark ...
    urllib.request.urlretrieve(data_conf["url"], "/tmp/data.csv")
    all_columns = feature_names + [target_name]
    train_df = spark.read.format("csv")\
        .option("inferSchema", "true")\
        .load("/tmp/data.csv")\
        .toDF(*all_columns)

    # do feature eng in spark / joins whatever reason you're using pyspark...

    # split into test and train (fixed seed keeps the split reproducible)
    train_df = train_df.randomSplit([0.7, 0.3], 42)[0].toPandas()

    # split data into X and y
    # FIX: the positional axis argument to DataFrame.drop was deprecated in
    # pandas 1.0 and removed in 2.0 — use the explicit columns= form.
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access
    model.feature_names = feature_names

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    # (feature_names[0:8] was the full 8-element list — pass it directly)
    xgboost_to_pmml(pipeline=model, col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
def test_xgboost_05(self):
    """A binary-class XGBClassifier pipeline exports a PMML file."""
    bunch = datasets.load_iris()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    # Synthetic alternating 0/1 label turns iris into a binary problem.
    frame['target'] = [i % 2 for i in range(bunch.data.shape[0])]
    predictors = frame.columns.drop('target')
    label = 'target'
    clf_pipeline = Pipeline([('lgbmc', XGBClassifier())])
    clf_pipeline.fit(frame[predictors], frame[label])
    xgboost_to_pmml(clf_pipeline, predictors, label, "xgbc_bin_pmml.pmml")
    self.assertEqual(os.path.isfile("xgbc_bin_pmml.pmml"), True)
def test_xgboost_01(self):
    """A multiclass XGBClassifier pipeline exports a PMML file."""
    bunch = datasets.load_iris()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['Species'] = bunch.target
    predictors = frame.columns.drop('Species')
    label = 'Species'
    clf_pipeline = Pipeline([('lgbmc', XGBClassifier())])
    clf_pipeline.fit(frame[predictors], frame[label])
    xgboost_to_pmml(clf_pipeline, predictors, label, "xgbc_pmml.pmml")
    self.assertEqual(os.path.isfile("xgbc_pmml.pmml"), True)
def test_03_xgb_classifier(self):
    """Score a scaled binary XGBClassifier PMML in zserver and check both
    predictions and probabilities agree with the local pipeline's."""
    print("\ntest 03 (xgb classifier with preprocessing) [binary-class]\n")
    classifier = XGBClassifier()
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ("model", classifier)
    ])
    pipe.fit(self.X, self.Y_bin)
    pmml_file = "test03xgboost.pmml"
    xgboost_to_pmml(pipe, self.features, 'Species', pmml_file)
    deployed_name = self.adapa_utility.upload_to_zserver(pmml_file)
    remote_preds, remote_probs = self.adapa_utility.score_in_zserver(
        deployed_name, self.test_file)
    local_preds = pipe.predict(self.X)
    local_probs = pipe.predict_proba(self.X)
    self.assertEqual(
        self.adapa_utility.compare_predictions(remote_preds, local_preds), True)
    self.assertEqual(
        self.adapa_utility.compare_probability(remote_probs, local_probs), True)
def train(data_conf, model_conf, **kwargs):
    """Train a MinMaxScaler + XGBClassifier diabetes pipeline from a
    Teradata table and export joblib and PMML artefacts.

    Parameters
    ----------
    data_conf : dict
        Expects "table"; connection settings come from AOA_CONN_* env vars.
    model_conf : dict
        Expects "hyperParameters" with "eta" and "max_depth".
    """
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick",
                     "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_df = train_df.to_pandas()

    # split data into X and y
    # FIX: the positional axis argument to DataFrame.drop was deprecated in
    # pandas 1.0 and removed in 2.0 — use the explicit columns= form.
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model, col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
def test_xgboost_02(self):
    """A plain XGBRegressor pipeline on auto-mpg exports a PMML file."""
    auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
    X = auto.drop(['mpg', 'car name'], axis=1)
    y = auto['mpg']
    excluded = ('mpg', 'car name')
    feature_names = [col for col in auto.columns if col not in excluded]
    target_name = 'mpg'
    reg_pipeline = Pipeline([
        ('lgbmr', XGBRegressor())
    ])
    reg_pipeline.fit(auto[feature_names], auto[target_name])
    xgboost_to_pmml(reg_pipeline, feature_names, target_name, "xgbr_pmml.pmml")
    self.assertEqual(os.path.isfile("xgbr_pmml.pmml"), True)
def test_xgboost_06(self):
    """Passing a bare estimator (not a sklearn Pipeline) must raise TypeError."""
    bunch = datasets.load_iris()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['Species'] = bunch.target
    predictors = frame.columns.drop('Species')
    label = 'Species'
    out_file = "xgbc_pmml.pmml"
    classifier = XGBClassifier()
    classifier.fit(frame[predictors], frame[label])
    # The exporter only accepts Pipeline objects.
    with self.assertRaises(TypeError):
        xgboost_to_pmml(classifier, predictors, label, out_file,
                        model_name="testModel")
def test_xgboost_03(self):
    """A StandardScaler + XGBClassifier pipeline exports a PMML file with
    preprocessing included."""
    bunch = datasets.load_iris()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['Species'] = bunch.target
    predictors = frame.columns.drop('Species')
    label = 'Species'
    clf_pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('LGBMC_preprocess', XGBClassifier(n_estimators=5))
    ])
    clf_pipeline.fit(frame[predictors], frame[label])
    xgboost_to_pmml(clf_pipeline, predictors, label, "xgbc_pmml_preprocess.pmml")
    self.assertEqual(os.path.isfile("xgbc_pmml_preprocess.pmml"), True)
def train(data_conf, model_conf, **kwargs):
    """Train a MinMaxScaler + XGBClassifier diabetes pipeline from a
    pyspark dataframe (loaded via read_dataframe) and export joblib and
    PMML artefacts.

    Parameters
    ----------
    data_conf : dict
        Expects "url" for read_dataframe.
    model_conf : dict
        Expects "hyperParameters" with "eta" and "max_depth".
    """
    hyperparams = model_conf["hyperParameters"]

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick",
                     "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    train_df = read_dataframe(spark, data_conf["url"])

    # do feature eng in spark / joins whatever reason you're using pyspark...

    # split into test and train (fixed seed keeps the split reproducible)
    train_df = train_df.randomSplit([0.7, 0.3], 42)[0].toPandas()

    # split data into X and y
    # FIX: the positional axis argument to DataFrame.drop was deprecated in
    # pandas 1.0 and removed in 2.0 — use the explicit columns= form.
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access
    model.feature_names = feature_names

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    # (feature_names[0:8] was the full 8-element list — pass it directly)
    xgboost_to_pmml(pipeline=model, col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
random_state=seed) pipeline = Pipeline([('scaling', StandardScaler()), ('xgb', XGBClassifier(n_estimators=5, seed=seed))]) pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) y_pred_proba = pipeline.predict_proba(X_test) import pickle import numpy as np d = pickle.dumps(pipeline) saved_pipeline = pickle.loads(d) y_pred_saved = saved_pipeline.predict(X_test) y_pred_proba_saved = saved_pipeline.predict_proba(X_test) assert np.array_equal(y_pred, y_pred_saved), "Not equal after saved" assert np.array_equal(y_pred_proba_saved, y_pred_proba), "Not equal after saved" from nyoka import xgboost_to_pmml xgboost_to_pmml(saved_pipeline, features, target, "xgb-iris.pmml") from pypmml import Model model = Model.fromFile("xgb-iris.pmml") y_pred_pmml = model.predict(X_test) assert np.array_equal( y_pred, y_pred_pmml["predicted_Species"]), "Not equal after saved"
def test_xgboost_03(self):
    """Export a scaled multiclass XGBClassifier to PMML and verify every
    tree node's split value and leaf score against the booster's JSON dump.

    For a 3-class problem xgboost emits n_estimators trees per class,
    interleaved class-by-class (tree i belongs to class i % 3). The PMML
    mirrors this with one outer Segment per class, each containing an inner
    Segmentation of per-tree TreeModels. The `n` counter below regroups the
    interleaved JSON dumps back into per-class lists so the two traversal
    orders line up.

    Relies on helper methods defined elsewhere in this test class:
    - self.extractValues: recursively collects values/scores from a PMML node
    - self.create_node: recursively collects values/scores from a JSON tree
    """
    iris = datasets.load_iris()
    irisd = pd.DataFrame(iris.data, columns=iris.feature_names)
    irisd['Species'] = iris.target
    features = irisd.columns.drop('Species')
    target = 'Species'
    f_name = "xgbc_pmml_preprocess.pmml"
    model = XGBClassifier(n_estimators=5)
    pipeline_obj = Pipeline([('scaling', StandardScaler()), ('xgbc', model)])
    pipeline_obj.fit(irisd[features], irisd[target])
    xgboost_to_pmml(pipeline_obj, features, target, f_name)
    pmml_obj = pml.parse(f_name, True)
    pmml_value_list = []   # split-predicate values found in the PMML
    model_value_list = []  # split values found in the booster JSON dump
    pmml_score_list = []   # leaf scores found in the PMML
    model_score_list = []  # leaf scores found in the booster JSON dump
    # Per-class accumulators for the booster side (one trio per iris class).
    list_seg_score1 = []
    list_seg_score2 = []
    list_seg_score3 = []
    list_seg_val1 = []
    list_seg_val2 = []
    list_seg_val3 = []
    # One JSON dump per tree: n_estimators trees for each of the classes.
    get_nodes_in_json_format = []
    for i in range(model.n_estimators * model.n_classes_):
        get_nodes_in_json_format.append(
            json.loads(model._Booster.get_dump(dump_format='json')[i]))
    # Round-robin: dumps are interleaved class 1, 2, 3, 1, 2, 3, ...
    n = 1
    for i in range(len(get_nodes_in_json_format)):
        list_score_temp = []
        list_val_temp = []
        node_list = get_nodes_in_json_format[i]
        if n == 1:
            n = 2
            self.create_node(node_list, list_score_temp, list_val_temp)
            # Concatenation builds new lists, so clearing the temporaries
            # afterwards does not affect the accumulated results.
            list_seg_score1 = list_seg_score1 + list_score_temp
            list_seg_val1 = list_seg_val1 + list_val_temp
            list_val_temp.clear()
            list_score_temp.clear()
        elif n == 2:
            n = 3
            self.create_node(node_list, list_score_temp, list_val_temp)
            list_seg_score2 = list_seg_score2 + list_score_temp
            list_seg_val2 = list_seg_val2 + list_val_temp
            list_val_temp.clear()
            list_score_temp.clear()
        elif n == 3:
            n = 1
            self.create_node(node_list, list_score_temp, list_val_temp)
            list_seg_score3 = list_seg_score3 + list_score_temp
            list_seg_val3 = list_seg_val3 + list_val_temp
            list_val_temp.clear()
            list_score_temp.clear()
    # Per-class order matches the PMML's per-class segments below.
    model_score_list = list_seg_score1 + list_seg_score2 + list_seg_score3
    model_value_list = list_seg_val1 + list_seg_val2 + list_seg_val3
    seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
    for seg in seg_tab:
        # Segments 1-3 hold the per-class tree ensembles; later segments
        # (e.g. the softmax aggregation) are skipped.
        if int(seg.id) <= 3:
            for segment in seg.MiningModel.Segmentation.Segment:
                node_tab = segment.TreeModel.Node.Node
                if not node_tab:
                    # Single-node (stump) tree: root itself carries the score.
                    pmml_score_list.append(segment.TreeModel.Node.score)
                else:
                    for node in node_tab:
                        varlen = node.get_Node().__len__()
                        if varlen > 0:
                            # Internal node: record predicate value, recurse.
                            pmml_value_list.append(
                                node.SimplePredicate.value)
                            self.extractValues(node, pmml_value_list,
                                               pmml_score_list)
                        else:
                            # Leaf: record predicate value and score.
                            pmml_value_list.append(
                                node.SimplePredicate.value)
                            pmml_score_list.append(node.score)
    ##1 leaf scores agree (PMML stores them as strings)
    for model_val, pmml_val in zip(model_score_list, pmml_score_list):
        self.assertEqual(model_val, float(pmml_val))
    ##2 split values agree
    for model_val, pmml_val in zip(model_value_list, pmml_value_list):
        self.assertEqual(model_val, pmml_val)
    ##3 the PMML file was actually written to disk
    self.assertEqual(os.path.isfile(f_name), True)
def test_xgboost_04(self):
    """Export an XGBRegressor with DataFrameMapper preprocessing to PMML
    and verify every tree node's split value and leaf score against the
    booster's JSON dump.

    Relies on helper methods defined elsewhere in this test class:
    - self.extractValues: recursively collects values/scores from a PMML node
    - self.create_node: recursively collects values/scores from a JSON tree
    """
    auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
    X = auto.drop(['mpg'], axis=1)
    y = auto['mpg']
    # BUG FIX: `name not in 'mpg'` was a substring test against the string
    # 'mpg' and would also drop any column whose name is a substring of
    # "mpg". A one-element tuple gives the intended exact-name exclusion.
    feature_names = [name for name in auto.columns if name not in ('mpg',)]
    f_name = "xgbr_pmml_preprocess2.pmml"
    target_name = 'mpg'
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=101)
    model = XGBRegressor()
    pipeline_obj = Pipeline([
        ('mapper', DataFrameMapper([('car name', CountVectorizer()),
                                    (['displacement'], [StandardScaler()])])),
        ('xgbr', model)
    ])
    pipeline_obj.fit(x_train, y_train)
    xgboost_to_pmml(pipeline_obj, feature_names, target_name, f_name)
    pmml_obj = pml.parse(f_name, True)
    pmml_value_list = []   # split-predicate values found in the PMML
    model_value_list = []  # split values found in the booster JSON dump
    pmml_score_list = []   # leaf scores found in the PMML
    model_score_list = []  # leaf scores found in the booster JSON dump
    # One Segment per boosted tree in the mining model.
    seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
    for seg in seg_tab:
        for node in seg.TreeModel.Node.Node:
            varlen = node.get_Node().__len__()
            if varlen > 0:
                # Internal node: record its predicate value, then recurse.
                pmml_value_list.append(node.SimplePredicate.value)
                self.extractValues(node, pmml_value_list, pmml_score_list)
            else:
                # Leaf: record both the predicate value and the score.
                pmml_value_list.append(node.SimplePredicate.value)
                pmml_score_list.append(node.score)
    # Collect the same information straight from the fitted booster,
    # one JSON tree dump per estimator.
    get_nodes_in_json_format = []
    for i in range(model.n_estimators):
        get_nodes_in_json_format.append(
            json.loads(model._Booster.get_dump(dump_format='json')[i]))
    for i in range(len(get_nodes_in_json_format)):
        list_score_temp = []
        list_val_temp = []
        node_list = get_nodes_in_json_format[i]
        self.create_node(node_list, list_score_temp, list_val_temp)
        # Concatenation builds new lists, so clearing the temporaries
        # afterwards does not affect the accumulated results.
        model_score_list = model_score_list + list_score_temp
        model_value_list = model_value_list + list_val_temp
        list_val_temp.clear()
        list_score_temp.clear()
    ##1 leaf scores agree (PMML stores them as strings)
    for model_val, pmml_val in zip(model_score_list, pmml_score_list):
        self.assertEqual(model_val, float(pmml_val))
    ##2 split values agree
    for model_val, pmml_val in zip(model_value_list, pmml_value_list):
        self.assertEqual(model_val, pmml_val)
    ##3 the PMML file was actually written to disk
    self.assertEqual(os.path.isfile(f_name), True)
def test_xgboost_05(self):
    """Export a binary XGBClassifier to PMML and verify every tree node's
    split value and leaf score against the booster's JSON dump.

    For the binary case the PMML nests all trees inside the first outer
    Segment's inner Segmentation, so only `seg.id == 1` is traversed.

    Relies on helper methods defined elsewhere in this test class:
    - self.extractValues: recursively collects values/scores from a PMML node
    - self.create_node: recursively collects values/scores from a JSON tree
    """
    iris = datasets.load_iris()
    irisd = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Synthetic alternating 0/1 label turns iris into a binary problem.
    irisd['target'] = [i % 2 for i in range(iris.data.shape[0])]
    features = irisd.columns.drop('target')
    target = 'target'
    f_name = "xgbc_bin_pmml.pmml"
    model = XGBClassifier(min_child_weight=6,
                          n_estimators=10,
                          scale_pos_weight=10,
                          deterministic_histogram=False)
    pipeline_obj = Pipeline([('xgbc', model)])
    pipeline_obj.fit(irisd[features], irisd[target])
    xgboost_to_pmml(pipeline_obj, features, target, f_name)
    pmml_obj = pml.parse(f_name, True)
    pmml_value_list = []   # split-predicate values found in the PMML
    model_value_list = []  # split values found in the booster JSON dump
    pmml_score_list = []   # leaf scores found in the PMML
    model_score_list = []  # leaf scores found in the booster JSON dump
    seg_tab = pmml_obj.MiningModel[0].Segmentation.Segment
    for seg in seg_tab:
        # Only the first outer segment holds the boosted trees.
        if int(seg.id) == 1:
            for segment in seg.MiningModel.Segmentation.Segment:
                node_tab = segment.TreeModel.Node.Node
                if not node_tab:
                    # Single-node (stump) tree: root itself carries the score.
                    pmml_score_list.append(segment.TreeModel.Node.score)
                else:
                    for node in node_tab:
                        varlen = node.get_Node().__len__()
                        if varlen > 0:
                            # Internal node: record predicate value, recurse.
                            pmml_value_list.append(
                                node.SimplePredicate.value)
                            self.extractValues(node, pmml_value_list,
                                               pmml_score_list)
                        else:
                            # Leaf: record predicate value and score.
                            pmml_value_list.append(
                                node.SimplePredicate.value)
                            pmml_score_list.append(node.score)
    # Collect the same information straight from the fitted booster,
    # one JSON tree dump per estimator.
    get_nodes_in_json_format = []
    for i in range(model.n_estimators):
        get_nodes_in_json_format.append(
            json.loads(model._Booster.get_dump(dump_format='json')[i]))
    for i in range(len(get_nodes_in_json_format)):
        list_score_temp = []
        list_val_temp = []
        node_list = get_nodes_in_json_format[i]
        self.create_node(node_list, list_score_temp, list_val_temp)
        # Concatenation builds new lists, so clearing the temporaries
        # afterwards does not affect the accumulated results.
        model_score_list = model_score_list + list_score_temp
        model_value_list = model_value_list + list_val_temp
        list_val_temp.clear()
        list_score_temp.clear()
    ##1 leaf scores agree (PMML stores them as strings)
    for model_val, pmml_val in zip(model_score_list, pmml_score_list):
        self.assertEqual(model_val, float(pmml_val))
    ##2 split values agree
    for model_val, pmml_val in zip(model_value_list, pmml_value_list):
        self.assertEqual(model_val, pmml_val)
    ##3 the PMML file was actually written to disk
    self.assertEqual(os.path.isfile(f_name), True)