def build_audit(classifier, name, **pmml_options):
	"""Fit a PMML pipeline on the "Audit" dataset and store the model plus its predictions.

	Column handling depends on the classifier type: LGBMClassifier consumes
	label-encoded categoricals (with "Age" treated as categorical), every other
	classifier gets label-binarized categoricals with "Age" treated as continuous.

	Args:
		classifier: sklearn-style classifier to fit.
		name: basename for the stored PMML/CSV artifacts.
		pmml_options: options forwarded to pipeline.configure().
	"""
	is_lgbm = isinstance(classifier, LGBMClassifier)
	if is_lgbm:
		cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Income", "Hours"]
		encoder = label_encoder
	else:
		cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Age", "Income", "Hours"]
		encoder = label_binarizer
	cat_mappings = [([cat_column], [cat_domain(name), encoder(name)]) for cat_column in cat_columns]
	cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
	mapper = DataFrameMapper(cat_mappings + cont_mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	if is_lgbm:
		# NOTE(review): only the first six of the seven categorical columns are
		# flagged here ("Deductions" at index 6 is omitted) -- confirm intent.
		pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
	else:
		if isinstance(classifier, XGBClassifier) and name == "XGBoostAuditNA":
			# This variant needs a float "Age" column; note that it mutates the shared frame.
			audit_X["Age"] = audit_X["Age"].astype(float)
		pipeline.fit(audit_X, audit_y)
	if isinstance(classifier, XGBClassifier):
		# XGBoost computes in float32, so loosen the verification tolerances.
		pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
def build_sentiment(classifier, name, with_proba=True, **pmml_options):
	"""Train a TF-IDF + feature-selection pipeline on the "Sentiment" dataset and persist it."""
	# RandomForest gets float32 features to save memory; everything else keeps full precision.
	tfidf_dtype = numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64
	vectorizer = TfidfVectorizer(analyzer="word", preprocessor=None, strip_accents=None, lowercase=True, token_pattern=None, tokenizer=Splitter(), stop_words="english", ngram_range=(1, 2), norm=None, dtype=tfidf_dtype)
	steps = [
		("tf-idf", vectorizer),
		("selector", SelectKBest(f_classif, k=500)),
		("classifier", classifier),
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	result = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
	if with_proba == True:
		proba = DataFrame(pipeline.predict_proba(sentiment_X), columns=["probability(0)", "probability(1)"])
		result = pandas.concat((result, proba), axis=1)
	store_csv(result, name + ".csv")
def build_iris(classifier, name, **pmml_options):
	"""Fit a classifier on the "Iris" dataset, verify it, and store PMML plus predictions."""
	features = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
	mapper = DataFrameMapper([([feature], ContinuousDomain()) for feature in features])
	pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
	pipeline.fit(iris_X, iris_y)
	# Fixed random_state makes the verification sample reproducible.
	sample = iris_X.sample(n=3, random_state=13)
	if isinstance(classifier, XGBClassifier):
		# XGBoost computes in float32, so loosen the verification tolerances.
		pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
	else:
		pipeline.verify(sample)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
	proba_columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=proba_columns)
	store_csv(pandas.concat((species, species_proba), axis=1), name)
def submodel_evaluation(train_data, valid_data, model_list, category_feature, numeric_feature):
	"""Fit and evaluate each candidate model inside a PMML pipeline.

	Args:
		train_data: training DataFrame containing the features plus a 'user_type' label column.
		valid_data: validation DataFrame with the same layout.
		model_list: iterable of unfitted sklearn-style classifiers.
		category_feature: list of categorical feature names.
		numeric_feature: list of numeric feature names.

	Returns:
		dict mapping model class name -> [KS statistic, ROC AUC, accuracy] on the validation set.
	"""
	feature_columns = category_feature + numeric_feature
	X_train = train_data[feature_columns]
	y_train = train_data['user_type']
	X_valid = valid_data[feature_columns]
	y_valid = valid_data['user_type']
	pipeline_transformer = feature_union(category_feature, numeric_feature)
	model_result_dict = {}
	for model in model_list:
		model_name = model.__class__.__name__
		print('model %s evaluation' % model_name)
		sub_model = PMMLPipeline([
			('mapper', pipeline_transformer),
			('classifier', model)
		])
		sub_model.fit(X_train, y_train)
		predict_valid = sub_model.predict_proba(X_valid)[:, 1]
		predict_label = sub_model.predict(X_valid)
		# Consistency fix: reuse the already-extracted y_valid (was valid_data['user_type']).
		model_ks = plot_ks_curve(predict_valid, y_valid)
		model_auc = roc_auc_score(y_valid, predict_valid)
		accuracy = metrics.accuracy_score(y_valid, predict_label)
		model_result_dict[model_name] = [model_ks, model_auc, accuracy]
	return model_result_dict
def build_iris_vec(classifier, name):
	"""Fit a classifier directly on the raw "Iris" feature matrix and store PMML plus predictions."""
	pipeline = PMMLPipeline([("classifier", classifier)])
	pipeline.fit(iris_X, iris_y)
	store_pmml(pipeline, name)
	predictions = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	probability_columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]
	probabilities = DataFrame(pipeline.predict_proba(iris_X), columns = probability_columns)
	store_csv(pandas.concat((predictions, probabilities), axis = 1), name)
def build_audit(mapper, classifier, name, **pmml_options):
	"""Fit the given mapper + classifier as a PMML pipeline on the "Audit" dataset and persist it."""
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier),
	])
	pipeline.fit(audit_X, audit_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	labels = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
	probabilities = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
	store_csv(pandas.concat((labels, probabilities), axis=1), name)
def build_apollo(mapper, name):
	"""Fit a decision tree on the Apollo dataset and store the pickled pipeline and its predictions."""
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", DecisionTreeClassifier()),
	])
	# The full frame (including the "success" target) is passed as X; the mapper selects features.
	pipeline.fit(df, df["success"])
	store_pkl(pipeline, name)
	predicted = DataFrame(pipeline.predict(df), columns=["success"])
	predicted_proba = DataFrame(pipeline.predict_proba(df), columns=["probability(false)", "probability(true)"])
	store_csv(pandas.concat((predicted, predicted_proba), axis=1), name)
def test_predict_proba_transform(self):
	"""predict_proba_transform() appends the transformed probabilities to the raw ones."""
	# Post-process class probabilities with a natural logarithm.
	predict_proba_transformer = FunctionTransformer(numpy.log)
	pipeline = PMMLPipeline([("estimator", DummyClassifier(strategy = "prior"))], predict_proba_transformer = predict_proba_transformer)
	X = DataFrame([1.0, 1.0, 1.0, 1.0, 1.0, 1.0], columns = ["x"])
	y = Series(["green", "red", "yellow", "green", "red", "green"], name = "y")
	pipeline.fit(X, y)
	self.assertEqual(["green", "red", "yellow"], pipeline._final_estimator.classes_.tolist())
	# The "prior" strategy predicts class frequencies: 3/6 green, 2/6 red, 1/6 yellow.
	y_proba = [3 / 6.0, 2 / 6.0, 1 / 6.0]
	y_probat = [numpy.log(x) for x in y_proba]
	# Every row gets identical probabilities, so expect six copies of each vector.
	self.assertEqual([y_proba for i in range(0, 6)], pipeline.predict_proba(X).tolist())
	# Transformed output = raw probabilities concatenated with their logarithms.
	self.assertEqual([y_proba + y_probat for i in range(0, 6)], pipeline.predict_proba_transform(X).tolist())
def build_audit_dict(classifier, name, with_proba = True):
	"""Fit a DictVectorizer + classifier pipeline on the dict-encoded "Audit" dataset.

	Stores the pickled pipeline and a CSV of predictions (optionally with class probabilities).
	"""
	pipeline = PMMLPipeline([
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	])
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	if with_proba:  # idiomatic truthiness check (was "== True")
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
def build_iris_opt(classifier, name, fit_params = None, **pmml_options):
	"""Fit a classifier on the "Iris" training split, verify on a sample, and persist artifacts.

	Args:
		classifier: sklearn-style classifier to fit.
		name: basename for the stored pickle/CSV artifacts.
		fit_params: optional dict of extra keyword arguments forwarded to pipeline.fit().
			(Was a mutable {} default; None avoids the shared-default pitfall.)
		pmml_options: accepted for signature parity with the sibling builders.
	"""
	fit_params = {} if fit_params is None else fit_params
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	# Train only on the masked training rows; predictions below cover the full dataset.
	pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params)
	if isinstance(classifier, XGBClassifier):
		# XGBoost computes in float32, so loosen the verification tolerances.
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13))
	# NOTE(review): pmml_options is accepted but never applied (no pipeline.configure call),
	# unlike the sibling builders -- confirm whether that is intentional.
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)
def build_sentiment(classifier, transformer, name, with_proba = True, **pmml_options):
	"""Fit a text pipeline (vectorize, densify, select 500 best features, classify) and store PMML."""
	steps = [
		("transformer", transformer),
		("densifier", DenseTransformer()),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier),
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	result = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba:
		proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		result = pandas.concat((result, proba), axis = 1)
	store_csv(result, name)
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
	"""Fit a classifier on the petal columns of the "Versicolor" dataset and persist artifacts."""
	# Pass through only the two petal measurements; drop every other column.
	transformer = ColumnTransformer([("all", "passthrough", ["Petal.Length", "Petal.Width"])], remainder="drop")
	pipeline = PMMLPipeline([("transformer", transformer), ("classifier", classifier)])
	pipeline.fit(versicolor_X, versicolor_y)
	pipeline.configure(**pmml_options)
	pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
	if with_proba:  # idiomatic truthiness check (was "== True")
		species_proba = DataFrame(pipeline.predict_proba(versicolor_X), columns=["probability(0)", "probability(1)"])
		species = pandas.concat((species, species_proba), axis=1)
	store_csv(species, name)
def build_audit_na_direct(classifier, name):
	"""Fit a classifier on "Audit-NA": numerics passed through, categoricals one-hot encoded."""
	numeric_columns = ["Age", "Hours", "Income"]
	categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	mapper = DataFrameMapper([
		(numeric_columns, None),
		(categorical_columns, OneHotEncoder())
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	store_pkl(pipeline, name)
	labels = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((labels, probabilities), axis = 1), name)
def build_audit_na_hist(classifier, name):
	"""Fit a classifier on "Audit-NA" with domain-decorated continuous and binarized categorical features."""
	continuous = ["Age", "Hours", "Income"]
	categorical = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	mappings = [([column], ContinuousDomain()) for column in continuous]
	mappings += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in categorical]
	mapper = DataFrameMapper(mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	labels = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((labels, probabilities), axis = 1), name)
def train_model(X: DataFrame, y: Series, attr_preprocs, pmml_fname, n_estimators):
	"""Train an XGBoost PMML pipeline, export it to PMML, and report in-sample AUC and KS.

	Returns:
		(pipeline, auc, ks) -- the fitted pipeline plus its training-set ROC AUC and KS statistic.
	"""
	classifier = XGBClassifier(
		n_gpus=0,
		objective="binary:logistic",
		n_jobs=30,
		max_depth=2,
		n_estimators=n_estimators,
		colsample_bytree=0.5,
		colsample_bylevel=0.5,
		colsample_bynode=0.5,
		subsample=0.5,
		reg_alpha=0.8,
		reg_lambda=2,
		missing=-99998,
	)
	pipeline = PMMLPipeline([
		("attribute_preprocessor", DataFrameMapper(attr_preprocs)),
		("classifier", classifier),
	])
	pipeline.fit(X, y)
	# Metrics are computed on the training data itself (optimistic estimate).
	scores = pipeline.predict_proba(X)[:, 1]
	fpr, tpr, _ = metrics.roc_curve(y, scores, pos_label=1)
	auc = metrics.auc(fpr, tpr)
	ks = np.max(tpr - fpr)
	sklearn2pmml(pipeline, pmml_fname, with_repr=True)
	return pipeline, auc, ks
def build_audit_dict(classifier, name, with_proba = True):
	"""Fit a DictVectorizer + classifier pipeline on dict-encoded "Audit" data, with PMML header metadata."""
	header = {
		"copyright" : "Copyright (c) 2021 Villu Ruusmann",
		"description" : "Integration test for dictionary (key-value mappings) input",
		"modelVersion" : "1.0.0"
	}
	steps = [
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps, header = header)
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	result = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	if with_proba == True:
		proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
		result = pandas.concat((result, proba), axis = 1)
	store_csv(result, name)
def build_sentiment(classifier, tokenizer, name, with_proba = True, **pmml_options):
	"""Fit a TF-IDF + word-count feature union pipeline on the "Sentiment" dataset and persist it."""
	tfidf = TfidfVectorizer(
		analyzer = "word",
		preprocessor = None,
		strip_accents = None,
		lowercase = True,
		tokenizer = tokenizer,
		stop_words = "english",
		ngram_range = (1, 2),
		norm = None,
		# Sub-linear TF scaling only for LogisticRegressionCV; float32 features only for RandomForest.
		sublinear_tf = isinstance(classifier, LogisticRegressionCV),
		dtype = (numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64)
	)
	union = FeatureUnion([
		("tf-idf", tfidf),
		("count", WordCountTransformer())
	])
	pipeline = PMMLPipeline([
		("union", union),
		("selector", SelectKBest(f_classif, k = 1000)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	result = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba == True:
		proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		result = pandas.concat((result, proba), axis = 1)
	store_csv(result, name)
def build_audit_na(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit a classifier on the "Audit-NA" dataset with explicit missing-value handling.

	Args:
		classifier: sklearn-style classifier to fit.
		name: basename for the stored pickle/CSV artifacts.
		with_proba: append class probabilities to the prediction CSV.
		fit_params / predict_params / predict_proba_params: optional keyword dicts forwarded
			to fit/predict/predict_proba (were mutable {} defaults; None avoids shared state).
		predict_transformer / predict_proba_transformer / apply_transformer: optional PMML
			post-processing transformers.
		pmml_options: options forwarded to pipeline.configure().
	"""
	# Normalize optional dict parameters (mutable default arguments are a shared-state pitfall).
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	predict_proba_params = {} if predict_proba_params is None else predict_proba_params
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		# Age: flag missing as -999, then impute the constant 38; plus a separate missing indicator.
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		# Hours: same flag-then-impute trick, relying on the imputer's own indicator.
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		# Income: treat out-of-range values as missing, then median-impute with indicator.
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		# Employment: impute, normalize case, collapse categories via lookup, then binarize.
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		# Education/Marital/Occupation: replace missing with "N/A", impute most frequent, binarize.
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		# Gender: constant-impute, uppercase, then map to numeric codes via gender_mapping.
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		# XGBoost computes in float32, so loosen the verification tolerances.
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# Also record the decision-tree leaf node id for each row.
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
def build_audit_na(classifier, name, with_proba = True, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit a classifier on the "Audit-NA" dataset with missing-value flagging and imputation.

	Stores the pickled pipeline plus a CSV of predictions; optionally appends class
	probabilities and (for decision trees) leaf node ids.
	"""
	# Collapse detailed employment categories to PRIVATE/PUBLIC.
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0,
		"MALE" : 1
	}
	# NOTE(review): this variant uses the legacy sklearn Imputer; the sibling builder uses SimpleImputer.
	mapper = DataFrameMapper(
		# Age/Hours: flag missing as -999 via an expression, then impute that sentinel.
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name = "flag_missing(Age, -999)"), Imputer(missing_values = -999)])] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), Imputer(missing_values = -999)])] +
		# Income: treat out-of-range values as missing, then impute.
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), Imputer()])] +
		# Employment: impute, normalize case, collapse categories via lookup, then binarize.
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		# Gender: impute, uppercase, then map to 0/1 codes (None = no default for unseen values).
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# Also record the decision-tree leaf node id for each row.
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name + ".csv")
(iris_X.columns.values, ContinuousDomain()) ])), ("classifier", SelectFirstClassifier([ ("select", Pipeline([ ("classifier", DecisionTreeClassifier(random_state = 13)) ]), "X[1] <= 3"), ("default", Pipeline([ ("scaler", StandardScaler()), ("classifier", LogisticRegression(multi_class = "ovr", solver = "liblinear")) ]), str(True)) ])) ]) pipeline.fit(iris_X, iris_y) store_pkl(pipeline, "SelectFirstIris") species = DataFrame(pipeline.predict(iris_X), columns = ["Species"]) species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]) species = pandas.concat((species, species_proba), axis = 1) store_csv(species, "SelectFirstIris") if "Iris" in datasets: classifier = RuleSetClassifier([ ("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"), ("X['Petal.Length'] >= 2.45", "virginica") ], default_score = "setosa") pipeline = PMMLPipeline([ ("classifier", classifier) ]) pipeline.fit(iris_X, iris_y) pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13)) store_pkl(pipeline, "RuleSetIris") species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
# Script: train a LightGBM classifier on the mpg dataset and export it to PMML.
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

# Toggle between a binary ("4" vs "f") and the full multiclass "drv" target.
binary = False
data = pd.read_csv("test/support/mpg.csv")
if binary:
	data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []  # none for this dataset; the text mapping below is a no-op

mapper = DataFrameMapper(
	[(numeric_features, [ContinuousDomain()])] +
	[([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
	[(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper), ("model", LGBMClassifier(n_estimators=1000))])
# Index 3 is the label-encoded "class" column (it follows the three numeric columns).
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")
# Smoke-test the fitted pipeline on a few rows.
print(list(pipeline.predict(data[:10])))
print(list(pipeline.predict_proba(data[0:1])[0]))
class AutoBuilder:
	"""End-to-end binary classifier builder.

	Builds a LightGBM binary classifier, including:
	- dataset EDA (optional)
	- hyperparameter tuning (optional)
	- model performance assessment
	- SHAP-based feature analysis
	- feature selection suggestions
	- creating a deployment package (pmml & pkl)

	The main entry point is auto_build(), which populates output_dir_path with
	model artifacts and evaluation charts.
	"""

	def __init__(
		self,
		output_dir_path,
		csv_path,
		target_col="target",
		# NOTE(review): mutable default arguments ([] and {} below) are shared across
		# instances -- safe only while they are never mutated; consider None sentinels.
		ignore_cols=[],
		eda_flag=True,
		tune_flag=True,
		cardinality_threshold=100,
		shap_plot_num=10,
		shap_frac=0.05,
		importance_cutoff=0.00,
		corr_cutoff=0.9,
		search_space=LGB_SEARCH_SPACE,
		tuning_iters=25,
		lgb_params={},
		random_state=1234,
	):
		"""
		Args:
			output_dir_path (string): filepath where the output package is created and saved
			csv_path (string): filepath to input csv; columns must be numeric or string type
			target_col (string, optional): target column name, default 'target'
			ignore_cols (iterable, optional): columns to be dropped, default []
			eda_flag (boolean, optional): whether EDA plots are generated, default True
			tune_flag (boolean, optional): whether LightGBM hyperparameters are tuned, default True
			cardinality_threshold (numeric, optional): categorical cardinality limit, default 100
			shap_plot_num (numeric, optional): generate SHAP dependency plots for N most important features, default 10
			shap_frac (numeric, optional): proportion of data sampled for SHAP analysis, default 5%
			importance_cutoff (numeric, optional): abs. avg. SHAP value below which a feature is suggested for removal, default 0.00
			corr_cutoff (numeric, optional): abs. correlation above which a feature is suggested for removal, default 0.9
			search_space (optional): tuning space for Bayesian optimisation, default LGB_SEARCH_SPACE
			tuning_iters (numeric, optional): number of tuning iterations for Bayesian optimisation, default 25
			lgb_params (dict, optional): hyperparams to use when tune_flag is False, default {}
			random_state (numeric, optional): random seed for train/test split and model training, default 1234
		"""
		self.output_dir_path = output_dir_path
		self.csv_path = csv_path
		self.target_col = target_col
		self.ignore_cols = ignore_cols
		self.eda_flag = eda_flag
		self.tune_flag = tune_flag
		self.cardinality_threshold = cardinality_threshold
		self.shap_plot_num = shap_plot_num
		self.shap_frac = shap_frac
		self.importance_cutoff = importance_cutoff
		self.corr_cutoff = corr_cutoff
		self.search_space = search_space
		self.tuning_iters = tuning_iters
		self.lgb_params = lgb_params
		self.random_state = random_state

	def _gen_model_dir(self):
		"""Create a fresh output directory (removing any previous one) with /bin and /plots subdirs."""
		# NOTE(review): this log message interpolates csv_path, but the directory being
		# built is output_dir_path -- looks like the wrong variable; confirm.
		logger.info(f"building directory {self.csv_path}")
		if os.path.exists(self.output_dir_path) and os.path.isdir(
				self.output_dir_path):
			shutil.rmtree(self.output_dir_path)
		os.mkdir(self.output_dir_path)
		os.mkdir(self.output_dir_path + "/bin")
		os.mkdir(self.output_dir_path + "/plots")

	def _process_csv(self):
		"""Parse the csv at self.csv_path into self.raw.

		Also:
		- drops the ignore columns
		- validates the target column (binary 0/1, no NaNs)
		- validates that every feature column is numeric or string
		"""
		logger.info(f"loading file {self.csv_path}")
		raw = pd.read_csv(self.csv_path).drop(columns=self.ignore_cols)
		logger.info("checking valid input data")
		# NOTE(review): asserts are stripped under python -O; raising would be safer.
		assert raw[self.target_col].isna().sum() == 0
		assert list(sorted(raw[self.target_col].unique())) == [0, 1]
		# Every column must be int64, float64 or object -- otherwise shapes differ.
		valid_shape = raw.select_dtypes(
			include=["int64", "float64", "object"]).shape
		assert valid_shape == raw.shape
		self.raw = raw
		raw.to_csv(f"{self.output_dir_path}/bin/raw.csv")

	def _prepare_X_y(self):
		"""Split self.raw into X_train/X_test/y_train/y_test and save the training set as csv."""
		y = self.raw[self.target_col]
		X = self.raw.drop(columns=self.target_col)
		logger.info("train test split")
		self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
			X, y, test_size=0.20, random_state=self.random_state)
		data_train = self.X_train.copy()
		data_train["target"] = self.y_train
		training_data_path = f"{self.output_dir_path}/bin/train.csv"
		data_train.to_csv(training_data_path, index=False)
		del X, y

	def _create_categorical_transformer(self):
		"""Build the DataFrameMapper (label-encoded categoricals + continuous numerics)."""
		self.categorical_cols = self.X_train.select_dtypes(
			include=["object"]).columns
		self.numeric_cols = self.X_train.select_dtypes(
			include=["int64", "float64"]).columns
		self.mapper = DataFrameMapper(
			[([cat_column], [CategoricalDomain(), LabelEncoder()])
			 for cat_column in self.categorical_cols] +
			[(self.numeric_cols, ContinuousDomain())])
		# hacky, also storing separated X_train_encoded and classifier,
		# because couldn't get SHAP and skopt to work for the e2e pipeline
		self.X_train_encoded = self.mapper.fit_transform(self.X_train)
		self.var_names = self.X_train.columns

	def _tune(self):
		"""Explore the tuning space, updating self.lgb_params with the best cross-validated values."""
		# todo, can I save memory, code and possibly tune binning strats by passing unencoded X_train into pipeline?
		logger.info(f"tuning {self.tuning_iters}")
		results = utils.bayes_hyperparam_tune(
			model=lgb.LGBMClassifier(objective="binary"),
			X=self.X_train_encoded,
			y=self.y_train,
			search_space=self.search_space,
			n_iters=self.tuning_iters,
		)
		self.lgb_params = results.best_params_
		logger.info(f"best params {self.lgb_params}")

	def _save_model(self):
		"""Save the fitted sklearn pipeline as a pkl file (pmml export currently disabled)."""
		pmml_path = f"{self.output_dir_path}/model-pmml.pmml"
		pkl_path = f"{self.output_dir_path}/model-bin.pkl"
		# NOTE(review): the file handle from open() is never closed; prefer a with-block.
		pickle.dump(self.pipeline, open(pkl_path, "wb"))
		# sklearn2pmml(self.pipeline, pmml_path)

	def _generate_shap_plots(self):
		"""Fit a standalone LightGBM model on the encoded training set and create SHAP plots."""
		classifier = lgb.LGBMClassifier(**self.lgb_params)
		classifier.fit(self.X_train_encoded, self.y_train)
		X_shap = pd.DataFrame(data=self.X_train_encoded,
							  columns=self.var_names)
		self.feature_importance = utils.create_shap_plots(
			classifier,
			X_shap,
			output_dir=self.output_dir_path,
			N=self.shap_plot_num,
			frac=self.shap_frac,
		)

	def auto_build(self):
		"""Populate output_dir_path with model artifacts and evaluation charts."""
		self._gen_model_dir()
		self._process_csv()
		self._prepare_X_y()
		if self.eda_flag:
			logger.info("EDA")
			utils.dataset_eda(data=self.X_train,
							  output_dir=self.output_dir_path)
		self._create_categorical_transformer()
		if self.tune_flag:
			self._tune()
		self._generate_shap_plots()
		logger.info("creating pipeline")
		classifier = lgb.LGBMClassifier(**self.lgb_params)
		self.pipeline = PMMLPipeline([("mapper", self.mapper),
									  ("classifier", classifier)])
		self.pipeline.fit(self.X_train, self.y_train)
		logger.info("Assessing model")
		y_pred = self.pipeline.predict_proba(self.X_test)[:, 1]
		# Benchmark: always predict the training-set base rate.
		y_bm = np.repeat(self.y_train.mean(), self.y_test.shape)
		utils.evaluate_model(self.y_test, y_pred, y_bm, self.output_dir_path,
							 "Model")
		logger.info("suggeting features to remove")
		self.cols_to_remove = utils.find_features_to_remove(
			importance=self.feature_importance,
			X=self.X_train,
			importance_cutoff=self.importance_cutoff,
			corr_threshold=self.corr_cutoff,
		)
		logger.info(f"candidates to remove - {self.cols_to_remove}")
		logger.info(f"saving model \n{self.output_dir_path}")
		self._save_model()
		# Log a single-row smoke test of the saved pipeline.
		test_input = dict(self.X_test.iloc[0])
		test_score = self.pipeline.predict_proba(self.X_test.head(1))
		logger.info(
			f"test-case model inputs \n{ test_input } \n model score \n {test_score}"
		)
		logger.info("done!")