def make_rf(**kwargs):
    """Train a small grid of random forest regressors and log each to MLflow.

    Pulls the ``run_id`` published by the ``process_data`` task via XCom,
    downloads that run's ``processed_data`` artifacts, and reads
    ``germany.csv`` from them. One ``RandomForestRegressor`` is fitted per
    (n_estimators, max_depth) combination on an 80/20 train/test split; each
    fit is recorded as its own MLflow run named ``rf`` with its parameters,
    the rmse/mae/r2 metrics and the fitted model. The ids of all runs are
    pushed to XCom as a list under the key ``run_id``.

    Args:
        **kwargs: Task context. Must contain ``ti``, an object exposing
            ``xcom_pull`` and ``xcom_push``.
    """
    task_instance = kwargs["ti"]
    source_run_id = task_instance.xcom_pull(task_ids="process_data", key="run_id")

    # Fetching through the tracking client is overkill when everything runs on
    # one machine, but it keeps this step working if the data-processing step
    # lives on a different server or infrastructure.
    artifact_dir = MlflowClient().download_artifacts(source_run_id, "processed_data")
    frame = pd.read_csv(artifact_dir + "/germany.csv", parse_dates=[0], index_col=0)

    features = frame[["windspeed", "temperature", "rad_horizontal", "rad_diffuse"]]
    targets = frame[["solar_GW", "wind_GW"]]
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    # Flattened hyper-parameter grid, in the same order as nested loops would
    # visit it: n_estimators is the outer axis, max_depth the inner one.
    grid = [(trees, depth) for trees in [4, 25] for depth in [4, 10]]

    run_ids = []
    for n_estimators, max_depth in grid:
        with mlflow.start_run(run_name="rf") as active_run:
            forest = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
            forest.fit(X_train, y_train)
            predictions = forest.predict(X_test)
            rmse, mae, r2 = eval_metrics(y_test, predictions)

            for param_name, param_value in (
                ("n_estimators", n_estimators),
                ("max_depth", max_depth),
            ):
                mlflow.log_param(param_name, param_value)
            for metric_name, metric_value in (("rmse", rmse), ("mae", mae), ("r2", r2)):
                mlflow.log_metric(metric_name, metric_value)
            mlflow.sklearn.log_model(forest, "model")

            run_ids.append(active_run.info.run_id)

    task_instance.xcom_push(key="run_id", value=run_ids)
def download_artifacts(
    run_id: str,
    remote_path: Union[str, Path],
    local_path: Union[str, Path],
    client: Union["MlflowClient", None] = None,
    no_cached=False,
) -> PathLike:
    """Download an artifact of an MLflow run into a local directory.

    Args:
        run_id: Id of the MLflow run that owns the artifact.
        remote_path: Artifact path relative to the run's artifact root.
        local_path: Local destination directory; created if it is missing.
        client: Client used for the download. When omitted, a new
            ``MlflowClient`` is created on each call, so the tracking
            configuration in effect at call time is used rather than the one
            in effect when this module was imported.
        no_cached: When true, any copy already present at
            ``local_path / remote_path`` (a file or a whole directory) is
            deleted before downloading.

    Returns:
        Whatever ``client.download_artifacts`` returns for the download.
    """
    import shutil  # local import: only the cache purge below needs it

    if client is None:
        client = MlflowClient()

    if no_cached:
        # Assumes the client materializes the artifact at
        # local_path/remote_path - TODO confirm against the MLflow version in use.
        cached = Path(local_path, remote_path)
        if cached.is_dir() and not cached.is_symlink():
            # Artifacts may be directories; os.remove cannot delete those.
            shutil.rmtree(cached)
        elif cached.exists() or cached.is_symlink():
            cached.unlink()
        # Nothing cached yet is not an error: there is simply nothing to purge.

    os.makedirs(local_path, exist_ok=True)
    return client.download_artifacts(run_id=run_id, path=remote_path, dst_path=local_path)
def make_lr(**kwargs):
    """Train a linear regression model and log it to MLflow.

    Pulls the ``run_id`` published by the ``process_data`` task via XCom,
    downloads that run's ``processed_data`` artifacts, and reads
    ``germany.csv`` from them. A ``LinearRegression`` is fitted on an 80/20
    train/test split and recorded as an MLflow run named ``lr`` with the
    rmse/mae/r2 metrics and the fitted model. The run id is pushed to XCom as
    a one-element list under the key ``run_id``.

    Args:
        **kwargs: Task context. Must contain ``ti``, an object exposing
            ``xcom_pull`` and ``xcom_push``.
    """
    task_instance = kwargs["ti"]
    source_run_id = task_instance.xcom_pull(task_ids="process_data", key="run_id")

    # Fetching through the tracking client is overkill when everything runs on
    # one machine, but it keeps this step working if the data-processing step
    # lives on a different server or infrastructure.
    artifact_dir = MlflowClient().download_artifacts(source_run_id, "processed_data")
    frame = pd.read_csv(artifact_dir + "/germany.csv", parse_dates=[0], index_col=0)

    features = frame[["windspeed", "temperature", "rad_horizontal", "rad_diffuse"]]
    targets = frame[["solar_GW", "wind_GW"]]
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    with mlflow.start_run(run_name="lr") as active_run:
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        predictions = regressor.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, predictions)

        for metric_name, metric_value in (("rmse", rmse), ("mae", mae), ("r2", r2)):
            mlflow.log_metric(metric_name, metric_value)
        mlflow.sklearn.log_model(regressor, "model")

    # Published as a list, the same shape the random-forest task uses.
    task_instance.xcom_push(key="run_id", value=[active_run.info.run_id])