Exemple #1
0
def main(argv):
    """Evaluate the saved classifier on the training data and plot the results.

    Reads the train and test CSV files, reports how many samples each holds,
    asks ``cl`` to load the model saved as ``lr2_classif.pkl``, predicts on the
    training samples and plots a raw and a normalized confusion matrix.

    Arguments:
        argv -- command-line arguments (not used by this function)
    """
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'

    lr_model_filename = 'lr2_classif.pkl'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data; the test set is only loaded to report its size.
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    # A print() call with one pre-formatted argument produces the same output
    # on Python 2 and Python 3, unlike the Python-2-only print statement.
    print("There are {0} samples in the train set.".format(len(X)))
    print("There are {0} samples in the test set.".format(len(test_x)))

    cl.lr_cl_load(lr_model_filename)

    # Predict on the training samples; the probabilities are not used here.
    pred_class, _ = cl.lr_cl_pred(X)

    # Third argument of both plots: the values 1..10.
    class_labels = np.arange(1, 11)

    viz.plot_confusion_matrix(y, pred_class, class_labels)
    viz.plot_confusion_matrix(y,
                              pred_class,
                              class_labels,
                              normalize=True,
                              filename='confusion_matrix_norm.png')
Exemple #2
0
    # Reuse a previously saved model when a load path was supplied; the model
    # and weights files are both looked up under options.load_path.
    if options.load_path is not None:
        model = load_model(options.load_path + '/' + model_filename,
                           options.load_path + '/' + weights_filename)
    # Otherwise train a new model from the train/validation splits.
    else:
        model, history = train_model(train_x, train_y, val_x, val_y)
        # Save model: the YAML description goes to the model file, the weights
        # to a separate file, both under options.model_path.
        full_model_filename = options.model_path + '/' + model_filename
        full_weights_filename = options.model_path + '/' + weights_filename
        with open(full_model_filename, 'w') as f:
            f.write(model.to_yaml())
            # NOTE(review): save_weights does not use `f`; it merely runs
            # while the model file is still open.
            model.save_weights(full_weights_filename)

        # Plot the training performance; `history` only exists on this
        # (training) path, which is why the call lives inside the else branch.
        viz.plot_nn_perf(history, 'nn_perf.png')

    # Predict on X and take the index of the highest score along axis 1 as the
    # predicted class.
    pred_proba = predict(model, X)
    pred_class = np.argmax(pred_proba, axis=1)
    # NOTE(review): np.argmax yields 0-based indices while np.arange(1, 11)
    # (1..10) is passed to the plots below; the two disabled lines suggest a
    # +1 shift was tried at some point. Confirm which encoding `y` uses.
    #pred_class = io.shift_v(pred_class, shift=1)
    #pred_class = io.shift_v(y, shift=1)

    # Plot the confusion matrix twice: default settings, then normalize=True.
    viz.plot_confusion_matrix(y,
                              pred_class,
                              np.arange(1, 11),
                              filename='confusion_matrix_nn_4.png')
    viz.plot_confusion_matrix(y,
                              pred_class,
                              np.arange(1, 11),
                              normalize=True,
                              filename='confusion_matrix_nn_4_norm.png')
Exemple #3
0
def classifiers_hyperparam_search(mlflow, config: ConfigParser,
                                  mlflow_url: str, train_params: dict,
                                  mlflow_tags: dict) -> None:
    """
    Trains a series of RandomForest models iterating through all combinations of training parameters as defined in
    train_params, logging the hyperparams, the resulting model and its metrics to mlflow.
    Uses processed data from location specified in config.

    Arguments:
        mlflow {module} -- the mlflow module (injected as dependency for mocking in integration tests)
        config {ConfigParser} -- required sections:
            "data": containing "dir_processed" and "fname_processed" (together defining the processed data path);
            "artifacts": containing "artifacts_temp";
            "outputs": containing filenames for the outputs ("fname_model" and "fname_conf_matrix");
            "training": containing a boolean field "include_amenities"
                (any spelling accepted by ConfigParser.getboolean, e.g. "true"/"false", "yes"/"no", "1"/"0");
            "mlflow": containing "mlflow_experiment";
        mlflow_url {str} -- the URL of the mlflow instance
        train_params {dict} -- A dictionary specifying the grid search hyperparameter values
            (all keys must be kwargs for sklearn RandomForestClassifier)
        mlflow_tags {dict} -- A dictionary containing tags for the mlflow run (e.g. "git_tag")

    Raises:
        ValueError -- if "include_amenities" is a non-empty string that is not a recognised boolean
    """
    # Unpack config
    dir_processed = Path(config["data"]["dir_processed"])
    fpath_processed_data = dir_processed / config["data"]["fname_processed"]
    dir_artifacts = Path(config["artifacts"]["artifacts_temp"])
    fpath_model = str(dir_artifacts / config["outputs"]["fname_model"])
    fpath_conf_matrix = str(dir_artifacts /
                            config["outputs"]["fname_conf_matrix"])
    mlflow_experiment = config["mlflow"]["mlflow_experiment"]

    # ConfigParser values are always strings, and bool() of any non-empty
    # string (including "False" or "0") is True, so the flag must be parsed
    # explicitly. Non-string values (e.g. a real bool from a dict-based
    # config) keep plain truthiness.
    raw_include_amenities = config["training"]["include_amenities"]
    if isinstance(raw_include_amenities, str):
        flag_key = raw_include_amenities.strip().lower()
        if flag_key and flag_key not in ConfigParser.BOOLEAN_STATES:
            raise ValueError(
                "Not a boolean value for include_amenities: {!r}".format(
                    raw_include_amenities))
        # An empty string stays False, as it was with bool("").
        include_amenities = ConfigParser.BOOLEAN_STATES.get(flag_key, False)
    else:
        include_amenities = bool(raw_include_amenities)

    # Set up
    dir_artifacts.mkdir(exist_ok=True)
    mlflow.set_tracking_uri(mlflow_url)
    mlflow.set_experiment(mlflow_experiment)

    with mlflow.start_run(run_name="hyperparam_search"):
        mlflow.set_tags(mlflow_tags)

        # Read input data, dropping every row that has a missing value
        df = pd.read_csv(fpath_processed_data, index_col=0).dropna(axis=0)

        # Set up training and testing data
        feature_names = [
            "neighbourhood", "room_type", "accommodates", "bathrooms",
            "bedrooms"
        ]
        amenities = [
            "TV", "Internet", "Air_conditioning", "Kitchen", "Heating", "Wifi",
            "Elevator", "Breakfast"
        ]
        cols = feature_names + amenities if include_amenities else feature_names

        X = df[cols]
        y = df["category"]
        # Fixed random_state keeps the split identical for every grid point
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=1)

        # Iterate through hyperparameter space, one nested run per combination:
        for params in ParameterGrid(train_params):
            with mlflow.start_run(run_name="train-simple-model", nested=True):
                mlflow.set_tags(mlflow_tags)
                # Train model
                clf = RandomForestClassifier(**params, n_jobs=4)
                clf.fit(X_train, y_train)

                # Save model (the same path is overwritten on every iteration)
                joblib.dump(clf, fpath_model)

                # Evaluate the model
                y_pred = clf.predict(X_test)
                y_proba = clf.predict_proba(X_test)

                metrics = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "roc_auc": roc_auc_score(y_test,
                                             y_proba,
                                             multi_class="ovr"),
                }

                plot_confusion_matrix(y_pred=y_pred,
                                      y_true=y_test,
                                      filepath=fpath_conf_matrix)

                # Log to MLflow
                mlflow.log_params(params)
                mlflow.log_param("amenities", include_amenities)
                mlflow.log_metrics(metrics)
                # NOTE(review): fpath_model lives inside dir_artifacts, so the
                # log_artifacts call below presumably uploads the model a
                # second time; both calls are kept to preserve the existing
                # logging sequence.
                mlflow.log_artifact(fpath_model)
                mlflow.log_artifacts(str(dir_artifacts))
Exemple #4
0
def train_densenet(mlflow, config: ConfigParser, mlflow_url: str,
                   mlflow_tags: dict) -> None:
    """
    Entry point of the example Pytorch training script.

    Steps performed inside a single MLflow run:

    - builds the train and validation data loaders from the processed data directory
    - creates a DenseNN together with a BCELoss loss function and an Adam optimizer
    - runs train_and_validate, which receives the path of the model file
    - reloads the weights stored at that path into a fresh DenseNN and evaluates it
      on both the train and the validation loader
    - writes the confusion matrix, the training history plot and the history CSV into
      the artifacts directory and logs artifacts, metrics and params to MLflow

    Arguments:
        mlflow -- the mlflow module (or a stand-in exposing the same calls)
        config {ConfigParser} -- read sections: "mlflow", "training", "paths", "filenames"
        mlflow_url {str} -- tracking URI handed to mlflow.set_tracking_uri
        mlflow_tags {dict} -- tags attached to the MLflow run
    """
    # --- Configuration ---
    experiment_name = config["mlflow"]["mlflow_experiment"]

    training_cfg = config["training"]
    seed = int(training_cfg["random_seed"])
    batch_size = int(training_cfg["batch_size"])
    n_workers = int(training_cfg["n_workers"])
    epochs = int(training_cfg["epochs"])
    learning_rate = float(training_cfg["lr"])

    paths_cfg = config["paths"]
    processed_dir = paths_cfg["dir_processed"]
    artifacts_dir = Path(paths_cfg["artifacts_temp"])

    names_cfg = config["filenames"]
    conf_matrix_path = artifacts_dir / names_cfg["fname_conf_mat"]
    model_path = artifacts_dir / names_cfg["fname_model"]
    history_plot_path = artifacts_dir / names_cfg["fname_training_history"]
    history_csv_path = artifacts_dir / names_cfg["fname_training_history_csv"]

    # --- Seeding, output directory and MLflow setup ---
    np.random.seed(seed)
    torch.manual_seed(seed)
    artifacts_dir.mkdir(exist_ok=True)
    mlflow.set_tracking_uri(mlflow_url)
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name="pytorch_example_train", tags=mlflow_tags):

        # Data: the third returned split is not used here
        train_loader, val_loader, _ = load_data_splits_as_dataloader(
            dir_processed=processed_dir,
            batch_size=batch_size,
            n_workers=n_workers)

        # Network, loss function and optimizer
        initial_net = DenseNN()
        loss_fn = nn.BCELoss()
        optimizer = torch.optim.Adam(initial_net.parameters(),
                                     lr=learning_rate)

        # Train and validate; only the history is kept because the model is
        # reloaded from model_path right below.
        _, history_df, _ = train_and_validate(
            model=initial_net,
            loss_fn=loss_fn,
            optimizer=optimizer,
            train_loader=train_loader,
            val_loader=val_loader,
            epochs=epochs,
            filepath_model=model_path,
        )

        # Fresh network carrying the weights stored at model_path
        best_net = DenseNN()
        best_net.load_state_dict(torch.load(model_path))

        # Metrics of the reloaded model on both loaders
        train_loss, train_acc, _ = val_loop(dataloader=train_loader,
                                            model=best_net,
                                            loss_fn=loss_fn)
        val_loss, val_acc, val_labels = val_loop(dataloader=val_loader,
                                                 model=best_net,
                                                 loss_fn=loss_fn)
        y_val_true, y_val_pred = val_labels
        val_conf_matrix = confusion_matrix(y_val_true, y_val_pred)

        # Artifacts written to disk
        plot_confusion_matrix(
            val_conf_matrix,
            normalize=False,
            title="Confusion matrix (validation set)",
            savepath=conf_matrix_path,
        )
        plot_training_history(history_df,
                              title="Training history",
                              savepath=history_plot_path)
        history_df.to_csv(history_csv_path)

        # MLflow logging: artifacts first, then metrics, then params
        mlflow.log_artifacts(artifacts_dir)

        final_metrics = {
            "val_loss": val_loss,
            "val_acc": val_acc,
            "train_loss": train_loss,
            "train_acc": train_acc,
        }
        mlflow.log_metrics(final_metrics)

        run_params = {
            "epochs": epochs,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "classifier": "DenseNN",
        }
        mlflow.log_params(run_params)

        print("Done!")
Exemple #5
0
def train_classifiers(
    mlflow: Union[ModuleType, MagicMock],
    config: Union[ConfigParser, dict],
    mlflow_url: str,
    mlflow_tags: dict,
) -> None:
    """
    Trains a number of classifiers on the data that is found in the directory specified as dir_processed in config.

    Every classifier returned by create_classifiers is fitted on the training split and scored on the
    validation split inside its own nested MLflow run, which receives the classifier name as a param,
    the validation accuracy as a metric and the contents of the artifacts directory as artifacts.

    Arguments:
        mlflow {Union[ModuleType, MagicMock]} --  MLflow module or its mock replacement
        config {Union[ConfigParser, dict]} -- configuration for the training, with the required sections:
            - "training": containing "random_seed";
            - "paths": containing "artifacts_temp" and "dir_processed";
            - "mlflow": containing "mlflow_experiment"
        mlflow_url {str} -- MLflow URL (empty if replacing mlflow with a mock)
        mlflow_tags {dict} -- MLflow tags (empty if replacing mlflow with a mock)
    """
    # Unpack config:
    random_seed = int(config["training"]["random_seed"])
    dir_processed = config["paths"]["dir_processed"]
    dir_artifacts = Path(config["paths"]["artifacts_temp"])
    filepath_conf_matrix = dir_artifacts / "confusion_matrix.png"
    mlflow_experiment = config["mlflow"]["mlflow_experiment"]

    # Prepare before run
    np.random.seed(random_seed)
    dir_artifacts.mkdir(exist_ok=True)
    mlflow.set_tracking_uri(mlflow_url)
    mlflow.set_experiment(mlflow_experiment)

    with mlflow.start_run(run_name="sklearn_example_train", tags=mlflow_tags):

        # Load training and validation data (the third split of X and y is unused)
        X_train, X_val, _, y_train, y_val, _ = load_data_splits(
            dir_processed=dir_processed, as_type="array")

        # Define a number of classifiers
        models = create_classifiers()

        # Iterate fitting and validation through all model types, logging results to MLflow:
        for model_name, model in models.items():

            with mlflow.start_run(run_name=model_name,
                                  nested=True,
                                  tags=mlflow_tags):

                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)

                # Both metrics take (y_true, y_pred); accuracy is symmetric,
                # so the score is the same as with the arguments swapped.
                val_accuracy = accuracy_score(y_val, y_pred)
                cm = confusion_matrix(y_val, y_pred)

                # The same savepath is reused for every model, so each nested
                # run logs the plot produced for its own classifier.
                plot_confusion_matrix(
                    cm,
                    normalize=False,
                    title="Confusion matrix (validation set)",
                    savepath=filepath_conf_matrix,
                )

                mlflow.log_artifacts(dir_artifacts)
                mlflow.log_params({"classifier": model_name})
                mlflow.log_metrics({"val_acc": val_accuracy})