import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, roc_auc_score, mean_squared_error,
                             mean_absolute_error, r2_score)

# LendingClubDataProvider is defined elsewhere in the surrounding project.


class LendingClubTrainingPipeline():
    def __init__(self, spark, input_path, model_name, limit=None):
        self.spark = spark
        self.input_path = input_path
        self.model_name = model_name
        self.limit = limit
        self.data_provider = LendingClubDataProvider(spark, input_path, limit)

    def run(self):
        # Load and split the data, then train and log the model
        X_train, X_test, Y_train, Y_test = self.data_provider.run()
        self.train(X_train, X_test, Y_train, Y_test)

    def train(self, X_train, X_test, Y_train, Y_test):
        # Fit a simple logistic regression; the random forest variant is left as an alternative
        cl = LogisticRegression(random_state=42, max_iter=100)
        # cl = RandomForestClassifier(random_state=42)
        cl.fit(X_train, Y_train)
        # Log evaluation metrics and the fitted model under a single MLflow run
        with mlflow.start_run(run_name="Training") as run:
            self.eval_and_log_metrics(cl, X_test, Y_test)
            mlflow.sklearn.log_model(cl, "model")

    def eval_and_log_metrics(self, estimator, X, Y):
        predictions = estimator.predict(X)

        # Compute metrics (classification metrics plus regression-style errors on the 0/1 predictions)
        acc = accuracy_score(Y, predictions)
        roc = roc_auc_score(Y, predictions)
        mse = mean_squared_error(Y, predictions)
        mae = mean_absolute_error(Y, predictions)
        r2 = r2_score(Y, predictions)

        # Print metrics
        print("  acc: {}".format(acc))
        print("  roc: {}".format(roc))
        print("  mse: {}".format(mse))
        print("  mae: {}".format(mae))
        print("  R2: {}".format(r2))

        # Log metrics
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("roc", roc)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        # Mark this run as a candidate so the evaluation pipeline can pick it up later
        mlflow.set_tag('candidate', 'true')
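A minimal usage sketch for the training pipeline; the Spark session setup, input path, model name, and row limit below are illustrative assumptions, not part of the original example.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
train_pipeline = LendingClubTrainingPipeline(
    spark,
    input_path="/mnt/data/lending_club",  # hypothetical input path
    model_name="lending_club_model",      # hypothetical registered model name
    limit=10000)                          # optional row cap for quick runs
train_pipeline.run()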
Example #2
import time

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from mlflow.exceptions import RestException
from sklearn.metrics import roc_auc_score


class LendingClubModelEvaluationPipeline():
    def __init__(self,
                 spark,
                 experimentID,
                 model_name,
                 input_path,
                 limit=None):
        self.spark = spark
        self.input_path = input_path
        self.model_name = model_name
        self.limit = limit
        self.experimentID = experimentID
        self.data_provider = LendingClubDataProvider(spark, input_path, limit)

    def run(self):
        mlflow_client = MlflowClient()

        # Score every candidate run on the hold-out set and keep the best one
        _, X_test, _, Y_test = self.data_provider.run()
        cand_run_ids = self.get_candidate_models()
        best_cand_roc, best_cand_run_id = self.get_best_model(
            cand_run_ids, X_test, Y_test)
        print('Best ROC (candidate models): ', best_cand_roc)

        try:
            # Compare against the model version(s) currently in Production, if any
            versions = mlflow_client.get_latest_versions(self.model_name,
                                                         stages=['Production'])
            prod_run_ids = [v.run_id for v in versions]
            best_prod_roc, best_prod_run_id = self.get_best_model(
                prod_run_ids, X_test, Y_test)
        except RestException:
            # No registered model (or no Production version) exists yet
            best_prod_roc = -1
        print('ROC (production models): ', best_prod_roc)

        if best_cand_roc >= best_prod_roc:
            # Deploy the new model: register the best candidate run, then promote it
            model_version = mlflow.register_model(
                "runs:/" + best_cand_run_id + "/model", self.model_name)
            # Give the model registry a moment to finish creating the new version
            time.sleep(5)
            mlflow_client.transition_model_version_stage(
                name=self.model_name,
                version=model_version.version,
                stage="Production")
            print('Deployed version: ', model_version.version)
        # Clear the candidate tag on all evaluated runs so they are not re-evaluated next time
        for run_id in cand_run_ids:
            mlflow_client.set_tag(run_id, 'candidate', 'false')

    def get_best_model(self, run_ids, X, Y):
        # Return the best ROC AUC among the given runs and the corresponding run id
        best_roc = -1
        best_run_id = None
        for run_id in run_ids:
            roc = self.evaluate_model(run_id, X, Y)
            if roc > best_roc:
                best_roc = roc
                best_run_id = run_id
        return best_roc, best_run_id

    def get_candidate_models(self):
        # Read the experiment's runs through the Databricks "mlflow-experiment" Spark
        # data source and keep only runs still tagged as candidates
        spark_df = self.spark.read.format("mlflow-experiment").load(
            self.experimentID)
        pdf = spark_df.where("tags.candidate='true'").select(
            "run_id").toPandas()
        return pdf['run_id'].values

    def evaluate_model(self, run_id, X, Y):
        # Load the sklearn model logged for the run and score it on the hold-out set.
        # Note that ROC AUC is computed on hard class predictions here, not probabilities.
        model = mlflow.sklearn.load_model('runs:/{}/model'.format(run_id))
        predictions = model.predict(X)
        # acc = accuracy_score(Y, predictions)
        roc = roc_auc_score(Y, predictions)
        return roc
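A minimal usage sketch for the evaluation/promotion pipeline, assuming the candidate runs above were logged to a known MLflow experiment on Databricks; the experiment ID, input path, and model name are hypothetical placeholders.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
eval_pipeline = LendingClubModelEvaluationPipeline(
    spark,
    experimentID="1234567890",            # hypothetical MLflow experiment ID
    model_name="lending_club_model",      # hypothetical registered model name
    input_path="/mnt/data/lending_club",  # hypothetical input path
    limit=10000)
eval_pipeline.run()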