import copy

import numpy as np

import lib.cl
import lib.io
import lib.viz


def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    lr_model_filename = 'lr2_classif.pkl'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # Load the pre-trained logistic regression classifier
    cl.lr_cl_load(lr_model_filename)

    # Predict and plot raw and normalized confusion matrices
    pred_class, pred_proba = cl.lr_cl_pred(X)
    viz.plot_confusion_matrix(y, pred_class, np.arange(1, 11))
    viz.plot_confusion_matrix(y, pred_class, np.arange(1, 11), normalize=True,
                              filename='confusion_matrix_norm.png')
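# lib.io.IO, lib.viz.Viz and lib.cl.CL are project-local helpers that are not
# shown in this section. As a hedged illustration only, a confusion-matrix
# plotter matching the call signature used above could be sketched with
# sklearn and matplotlib; the function name, colormap and defaults below are
# assumptions, not the actual lib.viz implementation.
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix_sketch(y_true, y_pred, labels, normalize=False,
                                 filename='confusion_matrix.png'):
    # Fix the label order so the matrix always has len(labels) rows/columns
    cm = confusion_matrix(y_true, y_pred, labels=labels).astype(float)
    if normalize:
        # Row-normalize so each row shows per-class recall
        cm /= cm.sum(axis=1, keepdims=True)
    fig, ax = plt.subplots()
    ax.imshow(cm, cmap='Blues')
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.savefig(filename)
    plt.close(fig)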
if options.load_path is not None:
    model = load_model(options.load_path + '/' + model_filename,
                       options.load_path + '/' + weights_filename)
    history = None  # no training history when loading a saved model
else:
    # Train
    model, history = train_model(train_x, train_y, val_x, val_y)

# Save model
full_model_filename = options.model_path + '/' + model_filename
full_weights_filename = options.model_path + '/' + weights_filename
with open(full_model_filename, 'w') as f:
    f.write(model.to_yaml())
model.save_weights(full_weights_filename)

# Plot training performance (only available when the model was just trained)
if history is not None:
    viz.plot_nn_perf(history, 'nn_perf.png')

pred_proba = predict(model, X)
pred_class = np.argmax(pred_proba, axis=1)
# pred_class = io.shift_v(pred_class, shift=1)
# pred_class = io.shift_v(y, shift=1)
viz.plot_confusion_matrix(y, pred_class, np.arange(1, 11),
                          filename='confusion_matrix_nn_4.png')
viz.plot_confusion_matrix(y, pred_class, np.arange(1, 11), normalize=True,
                          filename='confusion_matrix_nn_4_norm.png')
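# load_model is not shown in this section; it pairs with the
# to_yaml()/save_weights() save logic above, which is the (older) Keras API.
# A minimal sketch, assuming standalone Keras (model_from_yaml was removed in
# newer TensorFlow/Keras releases); the name is illustrative:
from keras.models import model_from_yaml


def load_model_sketch(model_path, weights_path):
    # Rebuild the architecture from its YAML description, then restore the
    # trained weights saved alongside it
    with open(model_path) as f:
        model = model_from_yaml(f.read())
    model.load_weights(weights_path)
    return model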
def classifiers_hyperparam_search(mlflow, config: ConfigParser,
                                  mlflow_url: str, train_params: dict,
                                  mlflow_tags: dict) -> None:
    """
    Trains a series of RandomForest models, iterating through all combinations
    of training parameters defined in train_params and logging the
    hyperparameters, the resulting model and its metrics to MLflow. Uses
    processed data from the location specified in config.

    Arguments:
        mlflow {module} -- the mlflow module (injected as a dependency for
            mocking in integration tests)
        config {ConfigParser} -- required sections:
            "data": containing "dir_processed" and "fname_processed" (together
                defining the processed data path);
            "artifacts": containing "artifacts_temp";
            "outputs": containing filenames for the outputs ("fname_model" and
                "fname_conf_matrix");
            "training": containing a boolean field "include_amenities";
            "mlflow": containing "mlflow_experiment"
        mlflow_url {str} -- the URL of the mlflow instance
        train_params {dict} -- a dictionary specifying the grid-search
            hyperparameter values (all keys must be kwargs for the sklearn
            RandomForestClassifier)
        mlflow_tags {dict} -- a dictionary of tags for the mlflow run
            (e.g. "git_tag")
    """
    # Unpack config
    dir_processed = Path(config["data"]["dir_processed"])
    fpath_processed_data = dir_processed / config["data"]["fname_processed"]
    dir_artifacts = Path(config["artifacts"]["artifacts_temp"])
    fpath_model = str(dir_artifacts / config["outputs"]["fname_model"])
    fpath_conf_matrix = str(dir_artifacts / config["outputs"]["fname_conf_matrix"])
    include_amenities = bool(config["training"]["include_amenities"])
    mlflow_experiment = config["mlflow"]["mlflow_experiment"]

    # Set up
    dir_artifacts.mkdir(exist_ok=True)
    mlflow.set_tracking_uri(mlflow_url)
    mlflow.set_experiment(mlflow_experiment)

    with mlflow.start_run(run_name="hyperparam_search"):
        mlflow.set_tags(mlflow_tags)

        # Read input data
        df = pd.read_csv(fpath_processed_data, index_col=0).dropna(axis=0)

        # Set up training and testing data
        feature_names = [
            "neighbourhood", "room_type", "accommodates", "bathrooms",
            "bedrooms"
        ]
        amenities = [
            "TV", "Internet", "Air_conditioning", "Kitchen", "Heating",
            "Wifi", "Elevator", "Breakfast"
        ]
        if include_amenities:
            cols = feature_names + amenities
        else:
            cols = feature_names
        X = df[cols]
        y = df["category"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.15, random_state=1)

        # Iterate through the hyperparameter space
        for params in ParameterGrid(train_params):
            with mlflow.start_run(run_name="train-simple-model", nested=True):
                mlflow.set_tags(mlflow_tags)

                # Train model
                clf = RandomForestClassifier(**params, n_jobs=4)
                clf.fit(X_train, y_train)

                # Save model
                joblib.dump(clf, fpath_model)

                # Evaluate the model
                y_pred = clf.predict(X_test)
                y_proba = clf.predict_proba(X_test)
                metrics = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "roc_auc": roc_auc_score(y_test, y_proba,
                                             multi_class="ovr"),
                }
                plot_confusion_matrix(y_pred=y_pred, y_true=y_test,
                                      filepath=fpath_conf_matrix)

                # Log to MLflow
                mlflow.log_params(params)
                mlflow.log_param("amenities", include_amenities)
                mlflow.log_metrics(metrics)
                mlflow.log_artifact(str(fpath_model))
                mlflow.log_artifacts(str(dir_artifacts))
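# For illustration, a train_params grid for the function above might look like
# the following (hypothetical values; all keys are valid RandomForestClassifier
# kwargs). ParameterGrid expands it into the cross product, so this example
# would train and log 2 * 2 * 2 = 8 nested runs.
example_train_params = {
    "n_estimators": [100, 200],
    "max_depth": [8, None],
    "min_samples_split": [2, 5],
}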
def train_densenet(mlflow, config: ConfigParser, mlflow_url: str,
                   mlflow_tags: dict) -> None:
    """
    The main function of the example PyTorch model training script

    - Loads and prepares breast cancer data for training (as defined in
      prepare_data.cancer_data)
    - Instantiates the densely connected neural network, optimizer and loss
      function for model training
    - Trains and validates a neural network (as defined in train_and_validate)
    - Keeps the best version of the model for final evaluation (not
      necessarily the one after the final epoch)
    - Saves the model, its training and validation metrics and the associated
      validation artifacts in MLflow
    """
    # Unpack config
    mlflow_experiment = config["mlflow"]["mlflow_experiment"]
    random_seed = int(config["training"]["random_seed"])
    batch_size = int(config["training"]["batch_size"])
    n_workers = int(config["training"]["n_workers"])
    epochs = int(config["training"]["epochs"])
    learning_rate = float(config["training"]["lr"])
    dir_processed = config["paths"]["dir_processed"]
    dir_artifacts = Path(config["paths"]["artifacts_temp"])
    filepath_conf_matrix = dir_artifacts / config["filenames"]["fname_conf_mat"]
    filepath_model = dir_artifacts / config["filenames"]["fname_model"]
    filepath_training_history = (
        dir_artifacts / config["filenames"]["fname_training_history"])
    filepath_training_history_csv = (
        dir_artifacts / config["filenames"]["fname_training_history_csv"])

    # Prepare before run
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    dir_artifacts.mkdir(exist_ok=True)
    mlflow.set_tracking_uri(mlflow_url)
    mlflow.set_experiment(mlflow_experiment)

    with mlflow.start_run(run_name="pytorch_example_train", tags=mlflow_tags):
        # Load the data splits
        train_loader, val_loader, _ = load_data_splits_as_dataloader(
            dir_processed=dir_processed, batch_size=batch_size,
            n_workers=n_workers)

        # Instantiate the dense NN, loss function and optimizer
        net = DenseNN()
        loss_fn = nn.BCELoss()
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

        # Train and validate
        net, df_history, _ = train_and_validate(
            model=net,
            loss_fn=loss_fn,
            optimizer=optimizer,
            train_loader=train_loader,
            val_loader=val_loader,
            epochs=epochs,
            filepath_model=filepath_model,
        )

        # Load the best version of the model
        net = DenseNN()
        net.load_state_dict(torch.load(filepath_model))

        # Get metrics for the best model
        train_loss, train_acc, _ = val_loop(dataloader=train_loader,
                                            model=net, loss_fn=loss_fn)
        val_loss, val_acc, (y_val_true, y_val_pred) = val_loop(
            dataloader=val_loader, model=net, loss_fn=loss_fn)
        cm = confusion_matrix(y_val_true, y_val_pred)

        # Save artifacts
        plot_confusion_matrix(
            cm,
            normalize=False,
            title="Confusion matrix (validation set)",
            savepath=filepath_conf_matrix,
        )
        plot_training_history(df_history, title="Training history",
                              savepath=filepath_training_history)
        df_history.to_csv(filepath_training_history_csv)

        # Log to MLflow
        mlflow.log_artifacts(dir_artifacts)
        mlflow.log_metrics(
            dict(
                val_loss=val_loss,
                val_acc=val_acc,
                train_loss=train_loss,
                train_acc=train_acc,
            ))
        mlflow.log_params(
            dict(
                epochs=epochs,
                batch_size=batch_size,
                learning_rate=learning_rate,
                classifier="DenseNN",
            ))

    print("Done!")
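# DenseNN is defined elsewhere. Because training uses nn.BCELoss(), the
# network must end in a sigmoid that outputs probabilities in [0, 1]. A
# minimal sketch of a densely connected net of that shape (the layer widths
# and the 30-feature input are assumptions, not the actual DenseNN):
import torch.nn as nn


class DenseNNSketch(nn.Module):
    def __init__(self, n_features: int = 30):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),  # BCELoss expects probabilities, not logits
        )

    def forward(self, x):
        return self.layers(x)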
def train_classifiers(
    mlflow: Union[ModuleType, MagicMock],
    config: Union[ConfigParser, dict],
    mlflow_url: str,
    mlflow_tags: dict,
) -> None:
    """
    Trains a number of classifiers on the data found in the directory
    specified as dir_processed in config.

    Arguments:
        mlflow {Union[ModuleType, MagicMock]} -- MLflow module or its mock
            replacement
        config {Union[ConfigParser, dict]} -- configuration for the training,
            with the required sections:
            - "training": containing "random_seed";
            - "paths": containing "artifacts_temp" and "dir_processed";
            - "mlflow": containing "mlflow_experiment"
        mlflow_url {str} -- MLflow URL (empty if replacing mlflow with a mock)
        mlflow_tags {dict} -- MLflow tags (empty if replacing mlflow with a
            mock)
    """
    # Unpack config
    random_seed = int(config["training"]["random_seed"])
    dir_processed = config["paths"]["dir_processed"]
    dir_artifacts = Path(config["paths"]["artifacts_temp"])
    filepath_conf_matrix = dir_artifacts / "confusion_matrix.png"
    mlflow_experiment = config["mlflow"]["mlflow_experiment"]

    # Prepare before run
    np.random.seed(random_seed)
    dir_artifacts.mkdir(exist_ok=True)
    mlflow.set_tracking_uri(mlflow_url)
    mlflow.set_experiment(mlflow_experiment)

    with mlflow.start_run(run_name="sklearn_example_train", tags=mlflow_tags):
        # Load training and validation data
        X_train, X_val, _, y_train, y_val, _ = load_data_splits(
            dir_processed=dir_processed, as_type="array")

        # Define a number of classifiers
        models = create_classifiers()

        # Fit and validate each model type, logging results to MLflow
        for model_name, model in models.items():
            with mlflow.start_run(run_name=model_name, nested=True,
                                  tags=mlflow_tags):
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                val_accuracy = accuracy_score(y_val, y_pred)
                cm = confusion_matrix(y_val, y_pred)
                plot_confusion_matrix(
                    cm,
                    normalize=False,
                    title="Confusion matrix (validation set)",
                    savepath=filepath_conf_matrix,
                )
                mlflow.log_artifacts(dir_artifacts)
                mlflow.log_params({"classifier": model_name})
                mlflow.log_metrics({"val_acc": val_accuracy})
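# create_classifiers is imported from elsewhere; the loop above only needs a
# name -> unfitted-estimator mapping, where the keys become the nested MLflow
# run names. An illustrative sketch (the model choices and settings are
# assumptions, not the actual create_classifiers):
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def create_classifiers_sketch() -> dict:
    return {
        "logistic_regression": LogisticRegression(max_iter=1000),
        "random_forest": RandomForestClassifier(n_estimators=100),
        "svm_rbf": SVC(kernel="rbf"),
    }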