def _load(self) -> Any:  # pragma: no cover
    if self.run_id:
        # if no run_id is specified, we take the artifact from the local path rather
        # than from the active run: chances are it has not been saved yet!
        mlflow_client = MlflowClient()
        if hasattr(self, "_version"):
            # all kedro versioned datasets inherit from AbstractVersionedDataSet
            local_path = self._get_load_path()
        elif hasattr(self, "_filepath"):
            # in case custom datasets inherit from AbstractDataSet without versioning
            local_path = self._filepath  # pragma: no cover
        elif hasattr(self, "_path"):
            # special datasets with a folder instead of a specific file, like PartitionedDataSet
            local_path = Path(self._path)

        artifact_path = (
            (self.artifact_path / local_path.name).as_posix()
            if self.artifact_path
            else local_path.name
        )

        mlflow_client.download_artifacts(
            run_id=self.run_id,
            path=artifact_path,
            dst_path=local_path.parent.as_posix(),  # must be a **local** **directory**
        )

    # finally, read locally
    return super()._load()
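# Usage sketch for the _load override above (assumptions: kedro-mlflow's
# MlflowArtifactDataSet and kedro's CSVDataSet, as exercised in the versioned-dataset
# test later in this section; exact import paths vary between kedro / kedro-mlflow
# versions). Passing a run_id makes _load download the artifact from that finished
# run before reading it locally.
from kedro.extras.datasets.pandas import CSVDataSet
from kedro_mlflow.io.artifacts import MlflowArtifactDataSet

reloaded_dataset = MlflowArtifactDataSet(
    data_set=dict(type=CSVDataSet, filepath="data/02_intermediate/df.csv"),
    run_id="<run_id that logged df.csv>",  # hypothetical run id
)
df_from_run = reloaded_dataset.load()  # downloads from MLflow, then reads the local copy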
def test_invalid_load():
    client = MlflowClient(tracking_uri=str(TEST_MLRUNS_PATH))
    with tempfile.TemporaryDirectory() as td:
        # should crash
        try:
            client.download_artifacts(
                "6e6280f331a94bf388fa9d0de0ecee99", "model/model.pkl", td)
            crashed = False
        except Exception:
            crashed = True
        assert crashed
def test_fixed_load():
    # first apply the fix in a tempdir
    with tempfile.TemporaryDirectory() as tmpdir:
        wd = Path(tmpdir)
        # copy folder from tests to tmp as "testruns" (changed from "mlruns")
        path_to_store = wd.joinpath("testruns")
        shutil.copytree(TEST_MLRUNS_PATH, path_to_store)
        # pass "mlruns" here because that is the folder name recorded in the metadata;
        # the current folder name should be autodetected
        assert fix.fix_meta(path_to_store, "mlruns")
        # now try to read
        client = MlflowClient(tracking_uri=str(path_to_store))
        # should run without error (file must be found)
        client.download_artifacts(
            "6e6280f331a94bf388fa9d0de0ecee99", "model/model.pkl", tmpdir)
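# The fix.fix_meta helper exercised above is not shown in this section. Below is a
# hypothetical sketch of what such a fix could look like, assuming (as the file-based
# MLflow store does) that each meta.yaml records an absolute artifact location that
# breaks when the mlruns folder is copied or renamed. Names and details are
# illustrative only, not the project's actual implementation.
from pathlib import Path


def fix_meta(store_path, old_folder_name: str) -> bool:
    """Re-anchor artifact locations in meta.yaml files at the store's new path."""
    store_path = Path(store_path).resolve()
    for meta_file in store_path.rglob("meta.yaml"):
        fixed_lines = []
        for line in meta_file.read_text().splitlines():
            key, sep, value = line.partition(":")
            if sep and key.strip() in ("artifact_uri", "artifact_location"):
                # keep the run-relative tail, replace the stale absolute prefix
                _, _, tail = value.partition(f"/{old_folder_name}/")
                if tail:
                    line = f"{key}: file://{store_path.as_posix()}/{tail}"
            fixed_lines.append(line)
        meta_file.write_text("\n".join(fixed_lines) + "\n")
    return True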
def download(model_name: str, out_path: str):
    import json

    client = MlflowClient(tracking_uri="databricks")
    for rm in client.list_registered_models():
        if rm.name == model_name:
            for version in rm.latest_versions:
                if version.current_stage == "Staging":
                    print(f"Downloading version {version.version} of {model_name} "
                          f"with run_id {version.run_id}...")
                    client.download_artifacts(version.run_id, "model/data/model.pt", '.')
                    meta = {
                        "version": version.version,
                        "run_id": version.run_id,
                        "name": model_name,
                    }
                    with open("model_meta.json", "w") as m:
                        json.dump(meta, m)
def main():
    parser = argparse.ArgumentParser(
        description="Execute python scripts in Databricks")
    parser.add_argument("-o", "--output_local_path",
                        help="Output path where the artifacts will be written",
                        required=True)
    parser.add_argument("-m", "--model_name",
                        help="Model Registry Name",
                        required=True)
    args = parser.parse_args()

    model_name = args.model_name
    output_local_path = args.output_local_path

    cli_profile_name = "registry"
    # TODO: Document that we assume that the registry profile will be created on the local machine:
    # dbutils.fs.put(f"file:///root/.databrickscfg",
    #                f"[{cli_profile_name}]\nhost={shard}\ntoken={token}",
    #                overwrite=True)
    TRACKING_URI = f"databricks://{cli_profile_name}"
    print(f"TRACKING_URI: {TRACKING_URI}")
    artifact_path = 'model'

    from mlflow.tracking import MlflowClient
    remote_client = MlflowClient(tracking_uri=TRACKING_URI)
    mlflow.set_tracking_uri(TRACKING_URI)
    # client = mlflow.tracking.MlflowClient()

    latest_model = remote_client.get_latest_versions(name=model_name, stages=["staging"])
    print(f"Latest Model: {latest_model}")

    run_id = latest_model[0].run_id
    artifact_uri = artifact_utils.get_artifact_uri(run_id)
    print(f"artifact_uri: {artifact_uri}")
    model_uri = f"runs:/{latest_model[0].run_id}/{artifact_path}"
    print(f"model_uri: {model_uri}")

    print(f"Downloading model artifacts to : {output_local_path}")
    remote_client.download_artifacts(run_id=run_id,
                                     path=artifact_path,
                                     dst_path=output_local_path)
def test_is_versioned_dataset_logged_correctly_in_mlflow(tmp_path, tracking_uri, df1):
    """Check if versioned dataset is logged correctly in MLflow as artifact.

    For versioned datasets just artifacts from current run should be logged.
    """
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())
    mlflow.start_run()

    run_id = mlflow.active_run().info.run_id
    active_run_id = mlflow.active_run().info.run_id

    mlflow_csv_dataset = MlflowArtifactDataSet(
        data_set=dict(type=CSVDataSet,
                      filepath=(tmp_path / "df1.csv").as_posix(),
                      versioned=True),
        run_id=run_id,
    )
    mlflow_csv_dataset.save(df1)

    run_artifacts = [
        fileinfo.path
        for fileinfo in mlflow_client.list_artifacts(run_id=run_id)
    ]

    # Check if just one artifact was created in given run.
    assert len(run_artifacts) == 1

    artifact_path = mlflow_client.download_artifacts(run_id=run_id,
                                                     path=run_artifacts[0])

    # Check if saved artifact is a file and not the folder where versioned datasets are stored.
    assert Path(artifact_path).is_file()
    assert (
        mlflow.active_run().info.run_id == active_run_id
        if mlflow.active_run()
        else True
    )  # if a run was opened before saving, it must be reopened
    assert df1.equals(mlflow_csv_dataset.load())  # and must be loadable

    mlflow.end_run()
import json
import tempfile

import mlflow
import pandas as pd
import plotly.graph_objects as go
import torch
from mlflow.tracking import MlflowClient
from plotly.subplots import make_subplots

from src.models.predict import predict_file, predict_smavra

RUN_ID = "4d8ddb41e7f340c182a6a62699502d9f"
DEVICE = "cuda"
TRAIN_DATA_PATH = "data/processed/resmed/train/train.pt"

mlflow_client = MlflowClient()

# get preprocessing info
with tempfile.TemporaryDirectory() as tmp_dir:
    with open(
            mlflow_client.download_artifacts(
                run_id=RUN_ID,
                path="config/preprocessing_config.json",
                dst_path=tmp_dir),
            "r") as f:
        preprocessing_config = json.load(f)

# load model
smavra = mlflow.pytorch.load_model('runs:/' + RUN_ID + '/model',
                                   map_location="cuda:0")
smavra.eval()

if DEVICE == "cuda":
    smavra.cuda()
else:
    smavra.cpu()

pred_tensor = torch.load(TRAIN_DATA_PATH)
import os

import mlflow
from mlflow.tracking import MlflowClient

if __name__ == "__main__":
    features = "rooms, zipcode, median_price, school_rating, transport"
    with open("features.txt", 'w') as f:
        f.write(features)

    # Log artifacts
    with mlflow.start_run() as run:
        mlflow.log_artifact("features.txt", artifact_path="features")

    client = MlflowClient()
    local_dir = "/tmp/artifact_downloads"
    if not os.path.exists(local_dir):
        os.mkdir(local_dir)
    local_path = client.download_artifacts(run.info.run_id, "features", local_dir)
    print("Artifacts downloaded at : {}".format(local_path))
    print("Artifacts: {}".format(os.listdir(local_path)))
def epoch_attention(
        run_id: str,
        session: str,
        epoch_nr: int,
        input_dir: str = "data/processed/resmed/score",
        preprocessing_config: str = "config/preprocessing_config.json",
        seq_len: int = 750,
        device: str = "cuda"):
    """predict using trained smavra network

    Args:
        run_id (str): run_id from mlflow experiment
        session (str): session whose attention weights should be computed
        epoch_nr (int): index of the epoch to visualize
        input_dir (str, optional): input directory holding the data to be
            predicted. Defaults to "data/processed/resmed/score".
        preprocessing_config (str, optional): location of preprocessing config
            in mlflow. Defaults to "config/preprocessing_config.json".
        seq_len (int, optional): sequence lengths -> workaround. Defaults to 750.
        device (str, optional): device to run inference on.
    """
    logger = logging.getLogger(__name__)

    logger.info(f"Reading file {session}.")

    session_file = Path(os.path.join(input_dir, f"{session}_0_HRD.edf.parquet"))

    if not session_file.exists():
        logger.error(f"file {str(session_file)} does not exist.")

    attention_path = Path(os.path.join("reports/figures", run_id, "attention"))

    # create the attention directory if it does not exist
    if not attention_path.exists():
        attention_path.mkdir(parents=True, exist_ok=True)

    logger.info("Setting up model.")
    mlflow_client = MlflowClient()

    # get preprocessing info
    with tempfile.TemporaryDirectory() as tmp_dir:
        with open(
                mlflow_client.download_artifacts(
                    run_id=run_id,
                    path="config/preprocessing_config.json",
                    dst_path=tmp_dir),
                "r") as f:
            preprocessing_config = json.load(f)

    # load model
    smavra = mlflow.pytorch.load_model('runs:/' + run_id + '/model',
                                       map_location="cuda:0")
    smavra.eval()

    if device == "cuda":
        smavra.cuda()
    else:
        smavra.cpu()

    logger.info("Getting attention.")

    pred_df = pq.read_table(session_file).to_pandas()
    pred_tensor = torch.Tensor(pred_df.iloc[:, :3].values)
    pred_tensor = reshape_resmed_tensor(pred_tensor, seq_len)

    dataset = ResmedDatasetEpoch(
        data=pred_tensor,
        batch_size=1,
        device=device,
        means=torch.Tensor(preprocessing_config["means"]),
        stds=torch.Tensor(preprocessing_config["stds"]))

    epoch = dataset[epoch_nr].unsqueeze(0).to(device)

    # get prediction
    h_t, latent, attention_weight, attention, lengths = \
        smavra.encode(epoch)

    _, n_heads, _, _ = attention_weight.shape

    attention_weights = {
        j: attention_weight[0, j, :, :].cpu().detach().numpy()
        for j in range(n_heads)
    }

    return attention_weights
def predict_smavra(
        run_id: str,
        input_dir: str = "data/processed/resmed/score",
        output_dir: str = "data/scored/resmed",
        preprocessing_config: str = "config/preprocessing_config.json",
        seq_len: int = 750,
        device: str = "cuda",
        explain_latent: bool = True,
        explain_attention: bool = False,
        score_file_pattern: str = "*"):
    """predict using trained smavra network

    Args:
        run_id (str): run_id from mlflow experiment
        input_dir (str, optional): input directory holding the data to be
            predicted. Defaults to "data/processed/resmed/score".
        output_dir (str, optional): the directory the predictions should be
            written to. Defaults to "data/scored/resmed".
        preprocessing_config (str, optional): location of preprocessing config
            in mlflow. Defaults to "config/preprocessing_config.json".
        seq_len (int, optional): sequence lengths -> workaround. Defaults to 750.
        device (str, optional): device to run inference on.
        explain_latent (bool, optional): whether to export latent representations.
        explain_attention (bool, optional): whether to export attention weights.
        score_file_pattern (str, optional): glob pattern selecting the files to score.
    """
    column_order = ["mask_press", "resp_flow", "delivered_volum"]

    logger = logging.getLogger(__name__)

    logger.info("Preparing directory structure.")
    # clean up output_dir
    output_score_dir = Path(os.path.join(output_dir, "score", run_id))
    output_explain_latent_dir = Path(
        os.path.join(output_dir, "explain", "latent", run_id))
    output_explain_attention_dir = Path(
        os.path.join(output_dir, "explain", "attention", run_id))

    # remove directories if they exist
    if output_score_dir.exists():
        shutil.rmtree(output_score_dir)
    output_score_dir.mkdir(parents=True, exist_ok=True)

    # remove directories if they exist
    if output_explain_latent_dir.exists():
        shutil.rmtree(output_explain_latent_dir)
    output_explain_latent_dir.mkdir(parents=True, exist_ok=True)

    # remove directories if they exist
    if output_explain_attention_dir.exists():
        shutil.rmtree(output_explain_attention_dir)
    output_explain_attention_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Setting up model.")
    mlflow_client = MlflowClient()

    # get preprocessing info
    with tempfile.TemporaryDirectory() as tmp_dir:
        with open(
                mlflow_client.download_artifacts(
                    run_id=run_id,
                    path="config/preprocessing_config.json",
                    dst_path=tmp_dir),
                "r") as f:
            preprocessing_config = json.load(f)

    # load model
    smavra = mlflow.pytorch.load_model('runs:/' + run_id + '/model',
                                       map_location="cuda:0")
    smavra.eval()

    if device == "cuda":
        smavra.cuda()
    else:
        smavra.cpu()

    logger.info("Start with prediction.")
    for score_file_path in Path(input_dir).glob(score_file_pattern):

        pred_df = pq.read_table(os.path.join(score_file_path)).to_pandas()
        pred_tensor = torch.Tensor(pred_df.iloc[:, :3].values)
        pred_tensor = reshape_resmed_tensor(pred_tensor, seq_len)

        score_dataset = ResmedDatasetEpoch(
            data=pred_tensor,
            batch_size=1,
            device=device,
            means=torch.Tensor(preprocessing_config["means"]),
            stds=torch.Tensor(preprocessing_config["stds"]))

        preds, latents, attention_weights = predict_file(
            model=smavra,
            dataset=score_dataset,
            file_path=score_file_path,
            explain_latent=explain_latent,
            explain_attention=explain_attention,
            seq_len=seq_len,
            column_order=column_order)

        if len(preds) > 1:
            preds = pd.concat(preds, ignore_index=True)
        else:
            preds = preds[0]

        preds = pd.concat([pred_df, preds], axis=1)

        # write predictions
        table = pa.Table.from_pandas(preds)
        file_name = os.path.basename(score_file_path)
        pq.write_table(table, os.path.join(output_score_dir, file_name))

        # EXPLAINABILITY
        # write latent
        if explain_latent:
            latents = np.stack(latents, 0)
            latent_cols = [f"latent_{i}" for i in range(latents.shape[1] - 3)]
            df = pd.DataFrame(
                latents,
                columns=latent_cols + ["epoch_loss", "epoch", "epoch_class"])
            df["file_name"] = os.path.basename(score_file_path)[:15]
            table = pa.Table.from_pandas(df)
            file_name = os.path.join(output_explain_latent_dir,
                                     os.path.basename(score_file_path))
            pq.write_table(table, file_name)

        # write attention
        if explain_attention:
            with open(
                    os.path.join(output_explain_attention_dir,
                                 os.path.basename(score_file_path) + ".pkl"),
                    "wb") as f:
                pk.dump(attention_weights, f)
os.system('nvidia-smi')

minio_client = minio.Minio('minio:9000',
                           access_key='admin',
                           secret_key='password',
                           secure=False)

from mlflow.tracking import MlflowClient

mlflow_client = MlflowClient()

configs_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'configs')

mlflow_client.download_artifacts(run_id, 'configs',
                                 dst_path=os.path.dirname(os.path.abspath(__file__)))
print(os.listdir('configs'))

train = None
with open(os.path.join(configs_path, 'train.json'), 'r') as f:
    train = json.load(f)
if train is None:
    exit(-1)

test = None
with open(os.path.join(configs_path, 'test.json'), 'r') as f:
    test = json.load(f)
import os
import uuid
import shutil
import pandas as pd
from mlflow.tracking import MlflowClient

# Create temp directory to download input data from MLflow
input_temp_dir = os.path.join(os.environ["SPARK_LOCAL_DIRS"], str(uuid.uuid4())[:8])
os.makedirs(input_temp_dir)

# Download the artifact and read it into a pandas DataFrame
input_client = MlflowClient()
input_data_path = input_client.download_artifacts(
    "0df1633dc8eb4dc0ac25655e9c851687", "data", input_temp_dir)

df_loaded = pd.read_parquet(os.path.join(input_data_path, "training_data"))

# Delete the temp data
shutil.rmtree(input_temp_dir)

# Preview data
df_loaded.head(5)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Preprocessors

# COMMAND ----------
class CustomerMlflowClient:
    def __init__(self, tracking_server_uri, experiment_name):
        try:
            self.mlflow_client = MlflowClient(tracking_server_uri)
            logger.info("established mlflow rest-api client")
        except Exception as e:
            logger.error(str(e))
        try:
            self.experiment_id = self.set_experiment(experiment_name)
            logger.info("started mlflow experiment {} with id {}".format(
                experiment_name, self.experiment_id))
        except Exception as e:
            logger.error(str(e))

    def logger(self, params, metrics, local_artifact_path, mlflow_artifact_path=None):
        run = self.mlflow_client.create_run(self.experiment_id)
        run_id = run.info.run_id
        logger.info("starting new run with id: {}".format(run_id))

        logger.info("logging parameter to mlflow tracking server")
        self.log_params(run_id, params)
        logger.info("successfully logged parameter to mlflow tracking server")

        logger.info("logging model metrics to mlflow tracking server")
        self.log_metrics(run_id, metrics)
        logger.info("successfully logged model metrics to mlflow tracking server")

        logger.info("logging model artifact to mlflow tracking server")
        self.log_artifact(run_id, local_artifact_path)
        logger.info("successfully logged model artifact to mlflow tracking server")

        logger.info("exiting run with id: {}".format(run_id))

    def set_experiment(self, experiment_name):
        experiment = self.mlflow_client.get_experiment_by_name(experiment_name)
        if experiment is None:
            return self.mlflow_client.create_experiment(experiment_name)
        else:
            return experiment.experiment_id

    def log_params(self, run_id: str, params):
        for key, value in params.items():
            self.mlflow_client.log_param(run_id=run_id, key=key, value=value)

    def log_metrics(self, run_id: str, metrics):
        for key, value in metrics.items():
            self.mlflow_client.log_metric(run_id=run_id, key=key, value=value)

    def log_artifact(self, run_id: str, artifact):
        self.mlflow_client.log_artifact(run_id=run_id, local_path=artifact)

    def get_latest_artifact(self, dest_path):
        run_info = self.mlflow_client.list_run_infos(self.experiment_id)
        latest_run_info = run_info[0]
        file_name = self.mlflow_client.list_artifacts(
            run_id=latest_run_info.run_id)[0].path
        complete_artifact_path = latest_run_info.artifact_uri + '/' + file_name
        self.mlflow_client.download_artifacts(run_id=latest_run_info.run_id,
                                              path=complete_artifact_path,
                                              dst_path=dest_path)
        return dest_path + file_name
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': epochs, 'batch_size': batch_size},
    }

    data_csv = generate_data(input_features, output_features,
                             os.path.join(tmpdir, 'train.csv'),
                             num_examples=num_examples)
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'validation.csv'))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

    mlflow_uri = f'file://{tmpdir}/mlruns'
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = 'mlflow_test'
    callback = MlflowCallback()

    model = LudwigModel(config, callbacks=[callback])
    model.train(training_set=data_csv,
                validation_set=val_csv,
                test_set=test_csv,
                experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    artifacts = [
        f.path for f in client.list_artifacts(callback.run.info.run_id, "")
    ]
    local_dir = f'{tmpdir}/local_artifacts'
    os.makedirs(local_dir)

    assert 'config.yaml' in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id,
                                                  "config.yaml", local_dir)
    with open(local_config_path, 'r') as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f'runs:/{callback.run.info.run_id}/model'
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert 'ludwig' in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors['ludwig']

    def compare_features(key):
        assert len(model.config[key]) == len(flavor['ludwig_schema'][key])
        for feature, schema_feature in zip(model.config[key],
                                           flavor['ludwig_schema'][key]):
            assert feature['name'] == schema_feature['name']
            assert feature['type'] == schema_feature['type']

    compare_features('input_features')
    compare_features('output_features')

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": epochs, "batch_size": batch_size},
    }

    data_csv = generate_data(
        input_features, output_features, os.path.join(tmpdir, "train.csv"), num_examples=num_examples
    )
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    mlflow_uri = f"file://{tmpdir}/mlruns"
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = "mlflow_test"
    callback = MlflowCallback()
    wrapped_callback = mock.Mock(wraps=callback)

    model = LudwigModel(config, callbacks=[wrapped_callback], backend=FakeRemoteBackend())
    model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv, experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    run = mlflow.get_run(run_id)
    assert run.info.status == "FINISHED"
    assert wrapped_callback.on_trainer_train_setup.call_count == 1
    assert wrapped_callback.on_trainer_train_teardown.call_count == 1

    artifacts = [f.path for f in client.list_artifacts(callback.run.info.run_id, "")]
    local_dir = f"{tmpdir}/local_artifacts"
    os.makedirs(local_dir)

    assert "config.yaml" in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id, "config.yaml", local_dir)
    with open(local_config_path) as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f"runs:/{callback.run.info.run_id}/model"
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert "ludwig" in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors["ludwig"]

    def compare_features(key):
        assert len(model.config[key]) == len(flavor["ludwig_schema"][key])
        for feature, schema_feature in zip(model.config[key], flavor["ludwig_schema"][key]):
            assert feature["name"] == schema_feature["name"]
            assert feature["type"] == schema_feature["type"]

    compare_features("input_features")
    compare_features("output_features")

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
for key, value in gen_inputs.items():
    log_param(key, value)
print(output)
log_artifacts(output.replace("file://", ""))
print(run)

# if run is None:
#     print("Something wrong at generation")
#     for key, value in a.items():
#         log_param(key, value)

## fetch the artifacts based on the run_id obtained earlier
generation_run_id = run.info.run_id
print("generation_run_id=[" + generation_run_id + "]")
generation_artifacts_localpath = client.download_artifacts(run_id=generation_run_id, path="")
print("download from Azure worked!!")
print(generation_artifacts_localpath)
# print("generation_artifacts_localpath=[" + generation_artifacts_localpath + "]")

# generation_artifacts_path = _get_or_run(
#     "analysis1",
#     {"generation": generation_run.info.run_id, "threshold": threshold, "min_sigma": min_sigma},
#     git_commit)
# a["artifacts_pathname"] = generation_artifacts_localpath

run = None
with mlflow.start_run(run_name='analysis1') as run:
    run = mlflow.active_run()
    output = kaizu_analysis1(ana1_inputs)
    for key, value in ana1_inputs.items():
        log_param(key, value)
    print(output)
    log_artifacts(output["artifacts"].replace("file://", ""))
    print(run)
def download_and_load_models(models, client=None, load=None, use_cached=False):
    """
    Load models from the mlflow registry

    :param use_cached: if set, use the cached downloaded model
    :param load: globally enable/disable model autoloading after download
    :param client: mlflow client instance or None
    :param models: model descriptors to load from the mlflow tracking server

    Expects ``models`` to be a mapping whose values are dicts of the form::

        {
            'uri': 'models:/uri/Stage',
            'use_cached': if set, overrides the global use_cached,
            'path': '/local/path/to/save/download/to',
            'load': if set, a callable used to load the model (defaults to keras);
                    set to False to disable loading for this specific model,
        }

    Returns the models dict with injected data:

        loaded: True if loading has been done
        model: model object if loaded
        downloaded: True if downloading has been done
        model_path: path within the download that was considered as the model
    """
    if client is None:
        client = MlflowClient()
    for m in list(models):
        mdata = models[m]
        uri = mdata["uri"]
        mdata["model_path"] = None
        mdata["model"] = None
        mdata["loaded"] = False
        mdata["downloaded"] = False

        match = MODEL_URI_RE.search(uri)
        gr = match.groupdict()
        version = [
            v for v in client.search_registered_models(
                filter_string=f"name='{gr['model']}'")[0].latest_versions
            if v.current_stage == gr["stage"]
        ][0]
        vmatch = SOURCE_RE.search(version.source)
        vgr = vmatch.groupdict()

        mlflowdir = mdata["path"]
        rm(mlflowdir)
        muse_cached = mdata.get("use_cached", use_cached)
        try:
            if not muse_cached:
                raise ModelNotFound("force download")
            mdata["model_path"] = find_model(mlflowdir)
        except ModelNotFound:
            create_dirs(mlflowdir)
            client.download_artifacts(vgr["runid"], vgr["path"], mlflowdir)
            mdata["model_path"] = find_model(mlflowdir)
            mdata["downloaded"] = True

        aload = mdata.get("load", load)
        if aload is None:
            aload = load_model
        if not aload:
            continue
        mdata["model"] = aload(mdata["model_path"])
        mdata["loaded"] = True
    return models
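# MODEL_URI_RE and SOURCE_RE are referenced above but not defined in this section.
# A plausible, purely illustrative pair of definitions: one parses
# "models:/<name>/<stage>" URIs, the other extracts the run id and artifact path
# from a registered model version's source URI (e.g. ".../<run_id>/artifacts/model").
import re

MODEL_URI_RE = re.compile(r"^models:/(?P<model>[^/]+)/(?P<stage>[^/]+)$")
SOURCE_RE = re.compile(r"/(?P<runid>[0-9a-f]{32})/artifacts/(?P<path>.+)$")

# quick check of the group names the function above relies on
assert MODEL_URI_RE.search("models:/my-model/Staging").groupdict() == {
    "model": "my-model", "stage": "Staging"}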
class ModeWrapperApp(Flask):

    def run(self, host=None, port=None, debug=None, load_dotenv=True, **options):
        """ Get env variables """
        self.model_name = os.environ['MODEL_NAME']
        self.model_version = os.environ['MODEL_VERSION']
        self.run_id = os.environ["MODEL_RUN_ID"]
        self.model_source_uri = os.environ["MODEL_SOURCE"]

        """ String literals """
        # we strip the "-<version>" suffix from the model name since the yaml files don't come with versions
        self.model_yaml_file = self.model_name.strip().replace(
            "-" + self.model_version, "") + ".yaml"
        self.model_yaml_file_caps_case = self.model_name.strip().replace(
            "-" + self.model_version, "").upper() + ".yaml"
        self.model_yaml_file_small_case = self.model_name.strip().replace(
            "-" + self.model_version, "").lower() + ".yaml"
        self.model_y_test_file_name = "y_test.csv"

        """
        Fetch model from mlflow registry using model source uri
        e.g. s3://mlflow-experiments/0/cfd8c04976a04d24b9d2ded788903beb/artifacts/model
        """
        self.model = mlflow.pytorch.load_model(model_uri=self.model_source_uri)

        """ Create mlflow client which is used to download model artifacts """
        self.mlflowClient = MlflowClient()

        """ Download artifacts from mlflow """
        local_dir = os.getcwd()
        if not os.path.exists(local_dir):
            os.mkdir(local_dir)
        local_path = self.mlflowClient.download_artifacts(
            self.run_id, "features", local_dir)

        """ Confirm downloaded files """
        print("Artifacts downloaded in: {}".format(local_path))
        print("Artifacts: {}".format(os.listdir(local_path)))

        """ Load model yaml from the model features directory """
        model_yaml_dir_path_caps = local_path + '/' + str(
            self.model_yaml_file_caps_case).strip()
        model_yaml_dir_path_small = local_path + '/' + str(
            self.model_yaml_file_small_case).strip()
        self.model_y_test_file = local_path + '/' + self.model_y_test_file_name
        self.features_local_path = local_path

        if os.path.exists(model_yaml_dir_path_caps):
            expt_conf = du.yaml.load(
                open(model_yaml_dir_path_caps).read().format(DATA_DIR=local_path),
                Loader=du._Loader)
        elif os.path.exists(model_yaml_dir_path_small):
            expt_conf = du.yaml.load(
                open(model_yaml_dir_path_small).read().format(DATA_DIR=local_path),
                Loader=du._Loader)

        """ Load model data """
        preprocessor = StandardScaler()
        train_filter = [
            filter_preprocessor(cols=expt_conf['numerical'],
                                preprocessor=preprocessor,
                                refit=True),
            filter_fillna(fill_value=expt_conf['normal_values'],
                          time_order_col=expt_conf['time_order_col'])
        ]
        transform = ptd.transform_drop_cols(cols_to_drop=expt_conf['time_order_col'])

        self.model_dataset = ptd.BaseDataset(
            tgt_file=expt_conf['test']['tgt_file'],
            feat_file=expt_conf['test']['feat_file'],
            idx_col=expt_conf['idx_cols'],
            tgt_col=expt_conf['tgt_col'],
            feat_columns=expt_conf['feat_cols'],
            time_order_col=expt_conf['time_order_col'],
            category_map=expt_conf['category_map'],
            transform=transform,
            filter=train_filter,
        )

        super(ModeWrapperApp, self).run(host=host,
                                        port=port,
                                        debug=debug,
                                        load_dotenv=load_dotenv,
                                        **options)
def main():
    print(os.environ)
    print('MODEL VERSION: ' + os.environ.get('MODEL_VERSION'))
    client = MlflowClient()
    client.download_artifacts(os.environ.get('MODEL_VERSION'), "model", '.')
run_id = os.getenv("MLFLOW_RUN_ID")

if run_id is None:
    print("Loading model and histogram from local files")
    model_file = "./model.pkl"
    histogram_file = "./histogram.prom"
else:
    print("Loading model and histogram from MLflow")
    from mlflow.tracking import MlflowClient

    client = MlflowClient()
    local_dir = "/tmp/artifact_downloads"
    if not os.path.exists(local_dir):
        os.mkdir(local_dir)
    local_path = client.download_artifacts(run_id, "", local_dir)
    print("Artifacts downloaded in: {}".format(local_path))
    print("Artifacts: {}".format(os.listdir(local_path)))
    model_file = f"{local_path}/model/model.pkl"
    histogram_file = f"{local_path}/histogram.txt"

with open(model_file, "rb") as f:
    model = pickle.load(f)

monitor = ModelMonitoringService(
    baseline_collector=BaselineMetricCollector(path=histogram_file))

app = Flask(__name__)
import os
import uuid
import shutil
import pandas as pd
from mlflow.tracking import MlflowClient

# Download input data from mlflow into a pandas DataFrame
# create temp directory to download data
temp_dir = os.path.join(os.environ["SPARK_LOCAL_DIRS"], str(uuid.uuid4())[:8])
os.makedirs(temp_dir)

# download the artifact and read it
client = MlflowClient()
training_data_path = client.download_artifacts(
    "0df1633dc8eb4dc0ac25655e9c851687", "data", temp_dir)
df = pd.read_parquet(os.path.join(training_data_path, "training_data"))

# delete the temp data
shutil.rmtree(temp_dir)

target_col = "tipped"

# COMMAND ----------

# MAGIC %md
# MAGIC ## Profiling Results

# COMMAND ----------

from pandas_profiling import ProfileReport