Example 1
def main(
    io_config="/mnt/configs/io.yml",
    feature_config="/mnt/configs/features.yml",
    models_config="/mnt/configs/train.yml",
    feature_subset=True):
    """ Main function that loads config, sets up logging, and trains a model

    Args:
        io_config (str, optional): yaml config for paths to load data
        feature_config (str, optional): yaml config defining the feature subset (features_to_keep)
        models_config (str, optional): yaml config for LGBM boosting and training params
        feature_subset (bool, optional): load only the reduced set of features_to_keep in features.yml
    """
    # Set IO Paths
    logger.info(f"Loading config {io_config}")
    config = parse_config(io_config)
    logger.info(f"Config: \n{config}")

    # Load Data
    columns = None
    if feature_subset:
        columns = parse_config(feature_config)["features_to_keep"]
    datasets = config["partitions_filenames"][:2]  # train & dev only
    dfs_map = load_data_partitions(config["partitions_folder"], datasets, columns)
    X_train, y_train = split_data(dfs_map['train']) 
    X_dev, y_dev = split_data(dfs_map['dev'])
    # TODO: Implement get n-unique users from dfs_map partitions   
    # -- Add model class method to associate n_users with the model obj 
        
    # Fill Missing Data
    [X_train, X_dev] = fill_missing_data([X_train, X_dev], missing_value=-999)
    
    # Load Model Parameters
    models_config = parse_config(models_config)
    boosting_params = models_config["boosting_params"]
    fit_params = models_config["fit_params"]
    fit_params["eval_set"] = [(X_dev, y_dev)]
    
    # Compute Class Imbalance
    compute_class_imbalance(y_train)
    
    # Train Model
    model = LightGBMModel(**boosting_params)
    model.do_fit(X_train, y_train, **fit_params)
    model.save_training_info(X_train)
    
    # Save Model
    model_name = f'lgbm_model_{dt.date.today()}'
    model.do_save(config['models_folder'], model_name)
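Helpers such as `split_data` and `fill_missing_data` come from the project's `bac` package and are not shown in these examples. A minimal sketch of what they might look like, assuming each partition DataFrame keeps the target in its first column and a user id in its second (as Example 6 suggests); the column layout and implementations are assumptions, not the project's actual code:

from typing import List, Tuple

import pandas as pd


def split_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    # Assumed layout: column 0 is the binary target, column 1 the user id,
    # the rest are features (mirrors Example 6's train.iloc[:, 2:]).
    y = df.iloc[:, 0]
    X = df.iloc[:, 2:]
    return X, y


def fill_missing_data(dfs: List[pd.DataFrame],
                      missing_value: int = -999) -> List[pd.DataFrame]:
    # Replace NaNs with a sentinel value so every model sees the same input.
    return [df.fillna(missing_value) for df in dfs]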
Example 2
def main(config):
    """Given lat-lon data for each BrAC observation,
    assign observations to clusters by the Kmeans algorithm,
    then format into a geojson, in order to visualize using MapBox API

    Args:
        config (dict): includes io filepaths & kmeans-kwargs
    """

    config = parse_config(config)
    input_fpath = config["input_fpath"]
    output_fpath = config["output_fpath"]
    km_params = config["km_params"]

    df = pd.read_csv(input_fpath, sep=',', header=0)
    logging.info(df.head(3))

    X = df[['longitude', 'latitude']]

    k = 120  # number of clusters
    step = 1  # step size used to increment k
    mod = 2  # modulo that sets verbosity/logging frequency

    sum_squared_dist = find_optimal_k(k, X, mod, step, km_params)
    plot_kmeans_ssd(k, sum_squared_dist, step)

    km_params['n_clusters'] = k
    km = KMeans(**km_params).fit(X)
    logging.info(f'Fit model with {k} centroids')

    cluster_labels, centroids = get_centroids(km)
    features = make_centroid_geodata(centroids)
    geojson = make_geojson(features)

    save_geojson(k, geojson, output_fpath)
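The GeoJSON helpers used above (`make_centroid_geodata`, `make_geojson`, `save_geojson`) are not shown. A hedged sketch, assuming `centroids` is an array of (longitude, latitude) pairs and each centroid becomes one Point feature; the filename handling is a guess:

import json


def make_centroid_geodata(centroids):
    # One GeoJSON Point feature per cluster centroid (longitude, latitude order).
    return [{
        "type": "Feature",
        "geometry": {"type": "Point", "coordinates": [float(lon), float(lat)]},
        "properties": {"cluster": i},
    } for i, (lon, lat) in enumerate(centroids)]


def make_geojson(features):
    return {"type": "FeatureCollection", "features": features}


def save_geojson(k, geojson, output_fpath):
    # Assumption: output_fpath may embed the number of clusters via a {k} placeholder.
    with open(str(output_fpath).format(k=k), "w") as f:
        json.dump(geojson, f)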
Example 3
def main(config_file="config.yml"):
    """
    Main function that loads config, sets up logging, and runs evaluation

    Args:
        config_file (str): path to config file (for logging)

    Returns:
        None
    """
    logger.info("Evaluating")
    # load config
    logger.info(f"Loading config {config_file}")
    config = parse_config(config_file)
    logger.info(f"Config: \n{config}")
    # log experiment
    params = {"param0": np.random.rand()}
    metrics = {"metric0": np.random.rand()}
    artifacts = {"config_file": config_file}
    log_experiment(
        params,
        metrics,
        artifacts,
        config["experiment_name"],
        os.environ["MLFLOW_TRACKING_URI"],
        os.environ["MLFLOW_ARTIFACT_LOCATION"],
    )
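`log_experiment` wraps the MLflow tracking API and is not shown. A minimal sketch under the assumption that it records a single run per call against the configured tracking server:

import mlflow
from mlflow.tracking import MlflowClient


def log_experiment(params, metrics, artifacts, experiment_name,
                   tracking_uri, artifact_location):
    mlflow.set_tracking_uri(tracking_uri)
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = client.create_experiment(
            experiment_name, artifact_location=artifact_location)
    else:
        experiment_id = experiment.experiment_id
    with mlflow.start_run(experiment_id=experiment_id):
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        for fpath in artifacts.values():
            mlflow.log_artifact(fpath)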
Example 4
def main(
    io_config="/mnt/configs/io.yml",
    features_config="/mnt/configs/features.yml",
    eval_config="/mnt/configs/evaluate.yml"
    ):
    """Load serialized models and evaluate

    Args:
        io_config: filepaths to load and save data
        features_config: Used to define feature sets for each model
        eval_config: filepath base for saved serialized models
    """
    
    # Load Configs:  Paths, Features, Saved Model-filepaths
    # TODO: eval_config can be part of io_config, if it doesn't need more args
    logger.info(f"Loading configs...")
    io_cfg = parse_config(io_config)
    features_cfg = parse_config(features_config)
    eval_cfg = parse_config(eval_config)
    models_base_fpath = eval_cfg["models_fpath"]
    figures_fpath = io_cfg["figures_fpath"]
    feature_labels = features_cfg["feature_labels"]
    
    # Load Features & Targets
    columns = features_cfg["features_to_keep"]
    test_set = io_cfg["partitions_filenames"][2]
    dfs_map = load_data_partitions(io_cfg["partitions_folder"], [test_set], columns)
    X_test, y_test = split_data(dfs_map['test']) 
    [X_test] = fill_missing_data([X_test], missing_value=-999)

    # Load final model
    model_name = "all"
    model_fpath = models_base_fpath.format(model_name=model_name)
    model = load_model(model_fpath)
    
    # Get predicted probabilities
    y_prob = model.do_predict(X_test)
                
    # Plot Shap Values for Best Model
    base_folder = Path(io_cfg["figures_fpath"])
    output_folder = base_folder / 'shap/'
    shap_df = compute_feature_importances(model.model, X_test, feature_labels, output_folder)
    
    logging.info(f"\nComplete.")
Example 5
def main(io_config="/mnt/configs/io.yml",
         features_config="/mnt/configs/features.yml"):

    # Load config for file-io
    io_config = parse_config(io_config)

    # Load config for feature subsetting
    features_config = parse_config(features_config)
    keep_features = features_config["features_to_keep"][
        2:]  # Exclude label & user_id
    labels_dict = features_config["feature_labels"]
    feature_labels = [labels_dict[feat]["label"] for feat in keep_features]

    # Load Data (only train)
    logging.info(f"\nLoading the partitioned data...")
    train_filename = (io_config["partitions_filenames"])[0]
    dfs_map = load_data_partitions(io_config["partitions_folder"],
                                   [train_filename])
    train = dfs_map["train"].iloc[:, 2:]

    # Feature correlation heat map: All Features
    figure_folder = Path(io_config["figures_fpath"])
    if not figure_folder.exists():
        raise ValueError(
            f"{figure_folder} is not a valid filepath for figures.")
    figure_fpath = str(figure_folder / "corr/feature_correlations_all.pdf")

    # 1) Plot Feature Correlations in the entire Feature Set
    # -- assumes first two columns are the label + user_id
    column_names = train.columns
    feature_labels = column_names
    make_heat_map(train, column_names, feature_labels, figure_fpath)
    logging.info(f"\nSaved heatmap of all features to: {figure_fpath}")

    # 2) Plot Feature Correlations in Reduced Feature Set
    figure_outpath = figure_folder / "corr/feature_correlations_reduced.pdf"
    keep_labels = [labels_dict[col]["label"] for col in keep_features]
    keep_train = train[keep_features]
    make_heat_map(keep_train, keep_features, keep_labels, figure_outpath)
    logging.info(
        f"\nSaved heatmap of reduced feature subset to: {figure_outpath}")
Example 6
def main(config_file="/mnt/configs/io_config.yml"):
    """
    Main function that loads config, sets up logging, and trains a model

    Args:
        config_file (str): path to config file (for logging)

    Returns:
        None
    """
    # Set Parameters
    logger.info(f"Loading config {config_file}")
    config = parse_config(config_file)
    logger.info(f"Config: \n{config}")

    # Load Data
    dfs_map = load_data_partitions(config["partitions_folder"],
                                   config["partitions_filenames"])
    train = dfs_map["train"]
    dev = dfs_map["dev"]

    # Limit Features

    # Fill Missing Data

    # Model Inputs
    X_train = train.iloc[:, 2:]
    y_train = train["bac_clinical"]
    X_dev = dev.iloc[:, 2:]
    y_dev = dev["bac_clinical"]

    logger.info("Training...")
    boosting_params = {
        "silent": False,
        "class_weight": "balanced",
        "n_estimators": 30
    }

    # Train Model
    model = LGBMClassifier(**boosting_params)
    model.fit(X_train,
              y_train,
              eval_set=[(X_dev, y_dev)],
              eval_metric="auc",
              early_stopping_rounds=5)
    logger.info(f"Best iteration: {model.best_iteration_}")

    # Save
    model_name = f'lgb_model_{dt.date.today()}'
    model_outpath = f"{config['models_folder']}/{model_name}.joblib"
    joblib.dump(model, model_outpath)

    logger.info(f"Completed. Model saved to docker: {model_outpath}")
Example 7
def main(io_config="/mnt/configs/io.yml",
         feature_config="/mnt/configs/features.yml",
         feature_subset=True):
    """Main function that loads config, sets up logging, and runs predictions

    Args:
        io_config: config for file paths to load or save.
        feature_config: config for feature sets included in various models.
        feature_subset (bool, optional): if True, load only the features_to_keep from the yaml
    """
    # Configure IO Paths
    logger.info(f"Loading config {io_config}")
    config = parse_config(io_config)

    # Load Data for dev set
    data_folder = config["partitions_folder"]
    filenames = config["partitions_filenames"]
    dev_filename = [filenames[1]]

    columns = None
    if feature_subset:
        columns = parse_config(feature_config)["features_to_keep"]
    dfs_map = load_data_partitions(data_folder, dev_filename, columns)
    X_eval, y_eval = split_data(dfs_map['dev'])

    logger.info(f"X_eval has shape = {X_eval.shape}")

    # Fill Missing Data
    [X_eval] = fill_missing_data([X_eval], missing_value=-999)

    # Load Model
    model = joblib.load(config["model_predict_fpath"])
    #print(model.__dict__.keys())

    # Predict
    logger.info("Predicting")
    scores = model.do_predict(X_eval)
Example 8
def main(io_config="/mnt/configs/io.yml",
         features_config="/mnt/configs/features.yml",
         models_config="/mnt/configs/train.yml",
         feature_subset=True):
    """ Main function that loads config, sets up logging, and trains a model

    Args:
        io_config (str, optional): yaml config for paths to load data
        features_config (str, optional): yaml config defining the feature subset (features_to_keep)
        models_config (str, optional): yaml config for LGBM boosting and training params
        feature_subset (bool, optional): load only the reduced set of features_to_keep in features.yml
    """
    # Load IO Paths
    logger.info(f"Loading config {io_config}")
    io_cfg = parse_config(io_config)
    logger.info(f"Config: \n{io_cfg}")

    # Load Feature & Model Configs
    features_cfg = parse_config(features_config)
    models_cfg = parse_config(models_config)

    # Set Model Parameters
    boosting_params = models_cfg["boosting_params"]
    model_folder = io_cfg['models_folder']

    # Load Data
    columns = None
    if feature_subset:
        columns = features_cfg["features_to_keep"]
    datasets = io_cfg["partitions_filenames"][:2]  # train & dev only
    dfs_map = load_data_partitions(io_cfg["partitions_folder"], datasets,
                                   columns)
    X_train, y_train = split_data(dfs_map['train'])
    X_dev, y_dev = split_data(dfs_map['dev'])
    # TODO: Implement get n-unique users from dfs_map partitions
    # -- Add model class method to associate n_users with the model obj

    # Fill Missing Data
    [X_train, X_dev] = fill_missing_data([X_train, X_dev], missing_value=-999)

    # Train Multiple Models, defined by ModelSchemas Class
    ms = ModelSchemas(X_train.columns, features_cfg)

    for model_schema in ms.schemas:
        logging.info(f"\nBuilding model {model_schema['name']}...")

        # Output Filename
        model_fname = f"lgbm_model_{model_schema['name']}"

        # Set X and y
        subset_columns = model_schema['features']
        target = model_schema["target"]
        X = X_train[subset_columns]
        y = y_train

        # Train & Save Models
        fit_params = models_cfg["fit_params"]
        fit_params["eval_set"] = [(X, y), (X_dev[subset_columns], y_dev)]
        fit_params["eval_names"] = ['train', 'dev']

        if target == y_train.name:
            clf = LightGBMModel(**boosting_params)
        elif target == 'majority_class':
            dummy_params = {"strategy": "most_frequent"}  # "stratified"
            clf = DummyModel(**dummy_params)
        else:
            raise ValueError(f"{target} specified in ModelSchemas \
                does not match y_train: {y_train.name}")

        clf.do_fit(X, y, **fit_params)
        clf.save_training_info(X)
        # DummyClassifier doesn't automatically log the aucs
        if target == 'majority_class':
            logging.info(f'Train auc: {clf.get_auc(X, y)}.')
            logging.info(
                f'Eval auc: {clf.get_auc(X_dev[subset_columns], y_dev)}')

        clf.do_save(model_folder, model_fname)
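`ModelSchemas` drives the loop above but is not shown. Only the schema keys actually consumed in Examples 8 and 9 ('name', 'plot_name', 'features', 'target') are grounded in these listings; the construction below, including the `feature_sets` config key and the target column name, is a hypothetical sketch:

class ModelSchemas:
    """Hypothetical container of per-model schemas (keys and construction assumed)."""

    def __init__(self, columns, features_cfg):
        target = "bac_clinical"  # assumed label column, as in Example 6
        feature_sets = features_cfg.get("feature_sets", {"all": list(columns)})
        self.schemas = [{
            "name": name,
            "plot_name": name.replace("_", " ").title(),
            "features": [c for c in columns if c in set(features)],
            "target": target,
        } for name, features in feature_sets.items()]
        # Majority-class baseline evaluated alongside the real models
        self.schemas.append({
            "name": "all_w_majority_class",
            "plot_name": "Majority class",
            "features": list(columns),
            "target": "majority_class",
        })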
Example 9
def main(io_config="/mnt/configs/io.yml",
         features_config="/mnt/configs/features.yml",
         eval_config='/mnt/configs/evaluate.yml'):
    """Load serialized models and evaluate

    Args:
        io_config: filepaths to load and save data
        features_config: Used to define feature sets for each model
        eval_config: filepath base for saved serialized models
    """

    # Load Configs:  Paths, Features, Saved Model-filepaths
    # TODO: eval_config can be part of io_config, if it doesn't need more args
    logger.info(f"Loading configs...")
    io_cfg = parse_config(io_config)
    features_cfg = parse_config(features_config)
    eval_cfg = parse_config(eval_config)
    models_base_fpath = eval_cfg["models_fpath"]
    figures_fpath = io_cfg["figures_fpath"]

    # Load Data, subset features
    columns = features_cfg["features_to_keep"]
    test_set = io_cfg["partitions_filenames"][2]
    dfs_map = load_data_partitions(io_cfg["partitions_folder"], [test_set],
                                   columns)
    X_test, y_test = split_data(dfs_map['test'])

    # Fill missing data
    [X_test] = fill_missing_data([X_test], missing_value=-999)

    # Loop over models
    # -- Eval multiple trained models
    # -- Build Table 1 summary metrics
    # -- Plot ROC Comparison Figures
    ms = ModelSchemas(X_test.columns, features_cfg)  # Multiple models
    model_metrics = {}  # Table 1
    roc_plot = {}  # ROC Figure

    for model_schema in ms.schemas:
        X, y = get_model_features_target(model_schema, X_test, y_test)

        # Load the model object
        model_name = model_schema["name"]
        model_fpath = models_base_fpath.format(model_name=model_name)
        model = load_model(model_fpath)

        # Evaluate model performance
        y_prob = model.do_predict(X)
        evaluator = ModelEvaluator(y, y_prob)
        model_metrics[model_name] = evaluator.metrics

        # Log Classification Report
        target_names = ['low BAC', 'high BAC']
        evaluator.log_classification_report(target_names)

        # Store for roc-comparison plot
        if model_name != "all_w_majority_class":
            roc_plot[model_name] = {
                "model_label": model_schema["plot_name"],
                "y_true": y,
                "y_prob": y_prob
            }

    # Build & Save Table 1
    model_metrics_df = build_table_1(model_metrics)

    # Plot ROC-comparison plot
    plot_order = model_metrics_df["model_name"].tolist()
    plot_order.remove("all_w_majority_class")
    plot_ROC_comparison(roc_plot, plot_order, figures_fpath, save_plot=True)

    logging.info(f"\nEvaluate.py Complete.")
Example 10
from bac.util.io import load_data
from bac.util.config import parse_config

# DATA_FOLDER = '/mnt/data'
# FILENAME = 'bac_2019-10-12.parquet'
# BACTRACK_FILEPATH = '/'.join([DATA_FOLDER, FILENAME])

# df = load_data(BACTRACK_FILEPATH)
# print(df.head(6))

io_config = "/mnt/configs/io_config.yml"
config = parse_config(io_config)
print(config["feature_label_fpath"])