def main(io_config="/mnt/configs/io.yml",
         feature_config="/mnt/configs/features.yml",
         models_config="/mnt/configs/train.yml",
         feature_subset=True):
    """Main function that loads config, sets up logging, and trains a model

    Args:
        io_config (str, optional): yaml config for paths to load data
        feature_config (str, optional): yaml config for feature selection (features_to_keep)
        models_config (str, optional): yaml config for LGBM boosting and training params
        feature_subset: load only the reduced set of features_to_keep in features.yml
    """
    # Set IO Paths
    logger.info(f"Loading config {io_config}")
    config = parse_config(io_config)
    logger.info(f"Config: \n{config}")

    # Load Data
    columns = None
    if feature_subset:
        columns = parse_config(feature_config)["features_to_keep"]
    datasets = config["partitions_filenames"][:2]  # train & dev only
    dfs_map = load_data_partitions(config["partitions_folder"], datasets, columns)
    X_train, y_train = split_data(dfs_map['train'])
    X_dev, y_dev = split_data(dfs_map['dev'])

    # TODO: Implement get n-unique users from dfs_map partitions
    # -- Add model class method to associate n_users with the model obj

    # Fill Missing Data
    [X_train, X_dev] = fill_missing_data([X_train, X_dev], missing_value=-999)

    # Load Model Parameters
    models_cfg = parse_config(models_config)
    boosting_params = models_cfg["boosting_params"]
    fit_params = models_cfg["fit_params"]
    fit_params["eval_set"] = [(X_dev, y_dev)]

    # Compute Class Imbalance
    compute_class_imbalance(y_train)

    # Train Model
    model = LightGBMModel(**boosting_params)
    model.do_fit(X_train, y_train, **fit_params)
    model.save_training_info(X_train)

    # Save Model
    model_name = f'lgbm_model_{dt.date.today()}'
    model.do_save(config['models_folder'], model_name)
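# NOTE: parse_config, split_data, and fill_missing_data are project-internal helpers.
# A minimal sketch of what fill_missing_data is assumed to do, inferred only from how it
# is called above (list of DataFrames in, list of DataFrames out, sentinel fill value);
# the real implementation may differ.
from typing import List

import pandas as pd


def fill_missing_data(dfs: List[pd.DataFrame], missing_value=-999) -> List[pd.DataFrame]:
    """Hypothetical sketch: replace NaNs with a sentinel so tree models can split on 'missing'."""
    return [df.fillna(missing_value) for df in dfs]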
def main(io_config="/mnt/configs/io.yml",
         features_config="/mnt/configs/features.yml",
         eval_config='/mnt/configs/evaluate.yml'):
    """Load serialized models and evaluate

    Args:
        io_config: filepaths to load and save data
        features_config: used to define feature sets for each model
        eval_config: filepath base for saved serialized models
    """
    # Load Configs: Paths, Features, Saved Model-filepaths
    # TODO: eval_config can be part of io_config, if it doesn't need more args
    logger.info("Loading configs...")
    io_cfg = parse_config(io_config)
    features_cfg = parse_config(features_config)
    eval_cfg = parse_config(eval_config)

    models_base_fpath = eval_cfg["models_fpath"]
    figures_fpath = io_cfg["figures_fpath"]
    feature_labels = features_cfg["feature_labels"]

    # Load Features & Targets
    columns = features_cfg["features_to_keep"]
    test_set = io_cfg["partitions_filenames"][2]
    dfs_map = load_data_partitions(io_cfg["partitions_folder"], [test_set], columns)
    X_test, y_test = split_data(dfs_map['test'])
    [X_test] = fill_missing_data([X_test], missing_value=-999)

    # Load final model
    model_name = "all"
    model_fpath = models_base_fpath.format(model_name=model_name)
    model = load_model(model_fpath)

    # Get predicted probabilities
    y_prob = model.do_predict(X_test)

    # Plot Shap Values for Best Model
    output_folder = Path(figures_fpath) / 'shap/'
    shap_df = compute_feature_importances(model.model, X_test, feature_labels, output_folder)

    logger.info("\nComplete.")
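# NOTE: compute_feature_importances is project-internal. A hedged sketch of what it is
# assumed to do with the shap library (TreeExplainer over the fitted LightGBM model);
# the output filename, return value, and handling of binary-class SHAP output are assumptions.
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap


def compute_feature_importances(model, X, feature_labels, output_folder):
    """Hypothetical sketch: compute SHAP values, save a summary plot, return mean |SHAP| per feature."""
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    # some shap versions return [class0, class1] arrays for binary classifiers
    vals = shap_values[1] if isinstance(shap_values, list) else shap_values

    shap.summary_plot(vals, X, feature_names=feature_labels, show=False)
    plt.savefig(output_folder / "shap_summary.png", bbox_inches="tight")
    plt.close()

    return pd.DataFrame({"feature": feature_labels,
                         "mean_abs_shap": np.abs(vals).mean(axis=0)})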
def main(io_config="/mnt/configs/io.yml",
         feature_config="/mnt/configs/features.yml",
         feature_subset=True):
    """Main function that loads config, sets up logging, and runs predictions

    Args:
        io_config: config for file paths to load or save.
        feature_config: config for feature sets included in various models.
        feature_subset: bool. If True, uses features_to_keep from the features yml.
    """
    # Configure IO Paths
    logger.info(f"Loading config {io_config}")
    config = parse_config(io_config)

    # Load Data for dev set
    data_folder = config["partitions_folder"]
    filenames = config["partitions_filenames"]
    dev_filename = [filenames[1]]
    columns = None
    if feature_subset:
        columns = parse_config(feature_config)["features_to_keep"]
    dfs_map = load_data_partitions(data_folder, dev_filename, columns)
    X_eval, y_eval = split_data(dfs_map['dev'])
    logger.info(f"X_eval has shape = {X_eval.shape}")

    # Fill Missing Data
    [X_eval] = fill_missing_data([X_eval], missing_value=-999)

    # Load Model
    model = joblib.load(config["model_predict_fpath"])

    # Predict
    logger.info("Predicting")
    scores = model.do_predict(X_eval)
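# NOTE: split_data is project-internal. Based on how it is used above (one partition
# DataFrame in, feature matrix and target out), a plausible sketch; the target column
# name used here is a hypothetical placeholder.
import pandas as pd


def split_data(df: pd.DataFrame, target_col: str = "target"):
    """Hypothetical sketch: split a partition into features X and target y."""
    y = df[target_col]
    X = df.drop(columns=[target_col])
    return X, y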
def main(io_config="/mnt/configs/io.yml",
         features_config="/mnt/configs/features.yml",
         models_config="/mnt/configs/train.yml",
         feature_subset=True):
    """Main function that loads config, sets up logging, and trains a model

    Args:
        io_config (str, optional): yaml config for paths to load data
        features_config (str, optional): yaml config for feature selection (features_to_keep)
        models_config (str, optional): yaml config for LGBM boosting and training params
        feature_subset: load only the reduced set of features_to_keep in features.yml
    """
    # Load IO Paths
    logger.info(f"Loading config {io_config}")
    io_cfg = parse_config(io_config)
    logger.info(f"Config: \n{io_cfg}")

    # Load Feature & Model Configs
    features_cfg = parse_config(features_config)
    models_cfg = parse_config(models_config)

    # Set Model Parameters
    boosting_params = models_cfg["boosting_params"]
    model_folder = io_cfg['models_folder']

    # Load Data
    columns = None
    if feature_subset:
        columns = features_cfg["features_to_keep"]
    datasets = io_cfg["partitions_filenames"][:2]  # train & dev only
    dfs_map = load_data_partitions(io_cfg["partitions_folder"], datasets, columns)
    X_train, y_train = split_data(dfs_map['train'])
    X_dev, y_dev = split_data(dfs_map['dev'])

    # TODO: Implement get n-unique users from dfs_map partitions
    # -- Add model class method to associate n_users with the model obj

    # Fill Missing Data
    [X_train, X_dev] = fill_missing_data([X_train, X_dev], missing_value=-999)

    # Train Multiple Models, defined by ModelSchemas Class
    ms = ModelSchemas(X_train.columns, features_cfg)
    for model_schema in ms.schemas:
        logger.info(f"\nBuilding model {model_schema['name']}...")

        # Output Filename
        model_fname = f"lgbm_model_{model_schema['name']}"

        # Set X and y
        subset_columns = model_schema['features']
        target = model_schema["target"]
        X = X_train[subset_columns]
        y = y_train

        # Train & Save Models
        fit_params = models_cfg["fit_params"]
        fit_params["eval_set"] = [(X, y), (X_dev[subset_columns], y_dev)]
        fit_params["eval_names"] = ['train', 'dev']

        if target == y_train.name:
            clf = LightGBMModel(**boosting_params)
        elif target == 'majority_class':
            dummy_params = {"strategy": "most_frequent"}  # alternative: "stratified"
            clf = DummyModel(**dummy_params)
        else:
            raise ValueError(
                f"{target} specified in ModelSchemas "
                f"does not match y_train: {y_train.name}")

        clf.do_fit(X, y, **fit_params)
        clf.save_training_info(X)

        # DummyClassifier doesn't automatically log the aucs
        if target == 'majority_class':
            logger.info(f'Train auc: {clf.get_auc(X, y)}.')
            logger.info(f'Eval auc: {clf.get_auc(X_dev[subset_columns], y_dev)}')

        clf.do_save(model_folder, model_fname)
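# NOTE: ModelSchemas is project-internal. Inferred only from how ms.schemas is consumed
# in the training and evaluation loops, each entry is assumed to be a dict roughly shaped
# like this; the feature and target names below are hypothetical placeholders.
example_schema = {
    "name": "all",                      # used in the output filename and model lookup
    "plot_name": "All features",        # label used in the ROC comparison plot
    "features": ["feat_a", "feat_b"],   # column subset of X_train (hypothetical names)
    "target": "high_bac",               # must equal y_train.name, or 'majority_class' for the baseline
}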
def main(io_config="/mnt/configs/io.yml",
         features_config="/mnt/configs/features.yml",
         eval_config='/mnt/configs/evaluate.yml'):
    """Load serialized models and evaluate

    Args:
        io_config: filepaths to load and save data
        features_config: used to define feature sets for each model
        eval_config: filepath base for saved serialized models
    """
    # Load Configs: Paths, Features, Saved Model-filepaths
    # TODO: eval_config can be part of io_config, if it doesn't need more args
    logger.info("Loading configs...")
    io_cfg = parse_config(io_config)
    features_cfg = parse_config(features_config)
    eval_cfg = parse_config(eval_config)

    models_base_fpath = eval_cfg["models_fpath"]
    figures_fpath = io_cfg["figures_fpath"]

    # Load Data, subset features
    columns = features_cfg["features_to_keep"]
    test_set = io_cfg["partitions_filenames"][2]
    dfs_map = load_data_partitions(io_cfg["partitions_folder"], [test_set], columns)
    X_test, y_test = split_data(dfs_map['test'])

    # Fill missing data
    [X_test] = fill_missing_data([X_test], missing_value=-999)

    # Loop over models
    # -- Eval multiple trained models
    # -- Build Table 1 summary metrics
    # -- Plot ROC Comparison Figures
    ms = ModelSchemas(X_test.columns, features_cfg)  # Multiple models
    model_metrics = {}  # Table 1
    roc_plot = {}  # ROC Figure
    for model_schema in ms.schemas:
        X, y = get_model_features_target(model_schema, X_test, y_test)

        # Load the model object
        model_name = model_schema["name"]
        model_fpath = models_base_fpath.format(model_name=model_name)
        model = load_model(model_fpath)

        # Evaluate model performance
        y_prob = model.do_predict(X)
        evaluator = ModelEvaluator(y, y_prob)
        model_metrics[model_name] = evaluator.metrics

        # Log Classification Report
        target_names = ['low BAC', 'high BAC']
        evaluator.log_classification_report(target_names)

        # Store for roc-comparison plot
        if model_name != "all_w_majority_class":
            roc_plot[model_name] = {
                "model_label": model_schema["plot_name"],
                "y_true": y,
                "y_prob": y_prob
            }

    # Build & Save Table 1
    model_metrics_df = build_table_1(model_metrics)

    # Plot ROC-comparison plot
    plot_order = model_metrics_df["model_name"].tolist()
    plot_order.remove("all_w_majority_class")
    plot_ROC_comparison(roc_plot, plot_order, figures_fpath, save_plot=True)

    logger.info("\nEvaluate.py Complete.")
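# NOTE: models_fpath in evaluate.yml is assumed to be a template string containing a
# {model_name} placeholder, matching the .format(model_name=...) calls above; the exact
# path pattern shown here is hypothetical.
models_base_fpath = "/mnt/models/lgbm_model_{model_name}.pkl"
model_fpath = models_base_fpath.format(model_name="all")
# -> "/mnt/models/lgbm_model_all.pkl"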