def _load(self) -> Any:  # pragma: no cover
    if self.run_id:
        # if no run_id is specified, we take the artifact from the local path
        # rather than the active run: there is a good chance it has not been
        # saved yet!

        mlflow_client = MlflowClient()

        if hasattr(self, "_version"):
            # all kedro datasets inherit from AbstractVersionedDataSet
            local_path = self._get_load_path()
        elif hasattr(self, "_filepath"):
            # in case a custom dataset inherits from AbstractDataSet without versioning
            local_path = self._filepath  # pragma: no cover
        elif hasattr(self, "_path"):
            # special datasets with a folder instead of a specific file, like PartitionedDataSet
            local_path = Path(self._path)

        artifact_path = ((self.artifact_path / local_path.name).as_posix()
                         if self.artifact_path else local_path.name)

        mlflow_client.download_artifacts(
            run_id=self.run_id,
            path=artifact_path,
            dst_path=local_path.parent.as_posix(),  # must be a **local** **directory**
        )

    # finally, read locally
    return super()._load()
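
For context, a minimal usage sketch of a dataset built on this _load override, mirroring the MlflowArtifactDataSet construction used in the versioned-dataset test further down this page (the filepath and run_id are placeholders):

# Hedged sketch; filepath and run_id are placeholders.
mlflow_csv_dataset = MlflowArtifactDataSet(
    data_set=dict(type=CSVDataSet, filepath="data/df1.csv"),
    run_id="<run_id_of_a_finished_run>",
)
df = mlflow_csv_dataset.load()  # downloads from the run, then reads locally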
Example 2
def test_invalid_load():
    client = MlflowClient(tracking_uri=str(TEST_MLRUNS_PATH))
    with tempfile.TemporaryDirectory() as td:
        # should crash
        try:
            client.download_artifacts("6e6280f331a94bf388fa9d0de0ecee99",
                                      "model/model.pkl", td)
            crashed = False
        except Exception:
            crashed = True
    assert crashed
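
A tighter variant of the same assertion, assuming pytest is already available in this test suite, is pytest.raises; a sketch:

import tempfile

import pytest


def test_invalid_load_with_pytest_raises():
    client = MlflowClient(tracking_uri=str(TEST_MLRUNS_PATH))
    with tempfile.TemporaryDirectory() as td:
        # the stale artifact location should make the download fail
        with pytest.raises(Exception):
            client.download_artifacts("6e6280f331a94bf388fa9d0de0ecee99",
                                      "model/model.pkl", td)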
Example 3
def test_fixed_load():
    # first fix in tempdir
    with tempfile.TemporaryDirectory() as tmpdir:
        wd = Path(tmpdir)
        # copy folder from tests to tmp as testruns (changed from mlruns)
        path_to_store = wd.joinpath("testruns")
        shutil.copytree(TEST_MLRUNS_PATH, path_to_store)
        # pass "mlruns" here because that is the folder name stored in the metadata; the current folder name should be autodetected
        assert fix.fix_meta(path_to_store, "mlruns")

        # now try to read
        client = MlflowClient(tracking_uri=str(path_to_store))

        # should run without error (file must be found)
        client.download_artifacts("6e6280f331a94bf388fa9d0de0ecee99",
                                  "model/model.pkl", tmpdir)
Example 4
def download(model_name: str, out_path: str):
    import json
    client = MlflowClient(tracking_uri="databricks")
    for rm in client.list_registered_models():
        if rm.name == model_name:
            for version in rm.latest_versions:
                if version.current_stage == "Staging":
                    print(f"Downloading verion {version.version} of {model_name} with run_id {version.run_id}...")
                    client.download_artifacts(version.run_id, "model/data/model.pt", '.')
                    meta = {
                        "version": version.version,
                        "run_id": version.run_id,
                        "name": model_name
                    }
                    with open("model_meta.json", "w") as m:
                        json.dump(meta, m)
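
Scanning every registered model is not strictly necessary here: MlflowClient.get_latest_versions (used in main() below) filters by name and stage directly. A sketch of the equivalent lookup:

def staging_run_id(client: MlflowClient, model_name: str) -> str:
    # returns the run_id of the latest version currently in "Staging"
    versions = client.get_latest_versions(name=model_name, stages=["Staging"])
    return versions[0].run_id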
def main():
    parser = argparse.ArgumentParser(
        description="Execute python scripts in Databricks")
    parser.add_argument("-o",
                        "--output_local_path",
                        help="Output path where the artifacts will be written",
                        required=True)
    parser.add_argument("-m",
                        "--model_name",
                        help="Model Registry Name",
                        required=True)
    args = parser.parse_args()

    model_name = args.model_name
    output_local_path = args.output_local_path

    cli_profile_name = "registry"
    # TODO: Document that we assume the registry profile has been created on the local machine:
    # dbutils.fs.put(f"file:///root/.databrickscfg", f"[{cli_profile_name}]\nhost={shard}\ntoken={token}",
    #                overwrite=True)
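    # For illustration, a hypothetical ~/.databrickscfg profile would look like
    # (host and token values are placeholders):
    #   [registry]
    #   host = https://<your-workspace>.cloud.databricks.com
    #   token = <personal-access-token>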

    TRACKING_URI = f"databricks://{cli_profile_name}"
    print(f"TRACKING_URI: {TRACKING_URI}")
    artifact_path = 'model'
    from mlflow.tracking import MlflowClient
    remote_client = MlflowClient(tracking_uri=TRACKING_URI)
    mlflow.set_tracking_uri(TRACKING_URI)
    # client = mlflow.tracking.MlflowClient()
    latest_model = remote_client.get_latest_versions(name=model_name,
                                                     stages=["staging"])
    print(f"Latest Model: {latest_model}")
    run_id = latest_model[0].run_id
    artifact_uri = artifact_utils.get_artifact_uri(run_id)
    print(f"artifact_uri: {artifact_uri}")
    model_uri = f"runs:/{latest_model[0].run_id}/{artifact_path}"
    print(f"model_uri: {model_uri}")

    print(f"Downloading model artifacts to : {output_local_path}")
    remote_client.download_artifacts(run_id=run_id,
                                     path=artifact_path,
                                     dst_path=output_local_path)
def test_is_versioned_dataset_logged_correctly_in_mlflow(
        tmp_path, tracking_uri, df1):
    """Check if versioned dataset is logged correctly in MLflow as artifact.

    For versioned datasets, only artifacts from the current run should be logged.
    """
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    mlflow.start_run()

    run_id = mlflow.active_run().info.run_id
    active_run_id = mlflow.active_run().info.run_id

    mlflow_csv_dataset = MlflowArtifactDataSet(
        data_set=dict(type=CSVDataSet,
                      filepath=(tmp_path / "df1.csv").as_posix(),
                      versioned=True),
        run_id=run_id,
    )
    mlflow_csv_dataset.save(df1)

    run_artifacts = [
        fileinfo.path
        for fileinfo in mlflow_client.list_artifacts(run_id=run_id)
    ]

    # Check that exactly one artifact was created in the given run.
    assert len(run_artifacts) == 1

    artifact_path = mlflow_client.download_artifacts(run_id=run_id,
                                                     path=run_artifacts[0])

    # Check that the saved artifact is a file, not the folder in which versioned datasets are stored.
    assert Path(artifact_path).is_file()

    assert (mlflow.active_run().info.run_id == active_run_id
            if mlflow.active_run() else True
            )  # if a run was opened before saving, it must be reopened
    assert df1.equals(mlflow_csv_dataset.load())  # and must be loadable

    mlflow.end_run()
Example 7
import json
import tempfile

import mlflow.pytorch
import pandas as pd
import plotly.graph_objects as go
import torch
from mlflow.tracking import MlflowClient
from plotly.subplots import make_subplots

from src.models.predict import predict_file, predict_smavra

RUN_ID = "4d8ddb41e7f340c182a6a62699502d9f"
DEVICE = "cuda"
TRAIN_DATA_PATH = "data/processed/resmed/train/train.pt"

mlflow_client = MlflowClient()

# get processing info
with tempfile.TemporaryDirectory() as tmp_dir:
    with open(
            mlflow_client.download_artifacts(
                run_id=RUN_ID,
                path="config/preprocessing_config.json",
                dst_path=tmp_dir), "r") as f:
        preprocessing_config = json.load(f)

# load model
smavra = mlflow.pytorch.load_model('runs:/' + RUN_ID + '/model',
                                   map_location="cuda:0")
smavra.eval()

if DEVICE == "cuda":
    smavra.cuda()
else:
    smavra.cpu()

pred_tensor = torch.load(TRAIN_DATA_PATH)
Example 8
import os
import mlflow
from mlflow.tracking import MlflowClient

if __name__ == "__main__":

    features = "rooms, zipcode, median_price, school_rating, transport"
    with open("features.txt", 'w') as f:
        f.write(features)

    # Log artifacts
    with mlflow.start_run() as run:
        mlflow.log_artifact("features.txt", artifact_path="features")

    client = MlflowClient()
    local_dir = "/tmp/artifact_downloads"
    if not os.path.exists(local_dir):
        os.mkdir(local_dir)
    local_path = client.download_artifacts(run.info.run_id, "features",
                                           local_dir)
    print("Artifacts downloaded at : {}".format(local_path))
    print("Artifacts: {}".format(os.listdir(local_path)))
Example 9
def epoch_attention(
        run_id: str,
        session: str,
        epoch_nr: int,
        input_dir: str = "data/processed/resmed/score",
        preprocessing_config: str = "config/preprocessing_config.json",
        seq_len: int = 750,
        device: str = "cuda"):
    """predict using trained smavra network

    Args:
        run_id (str): run_id from mlflow experiment
        input_dir (str, optional): input directory holding the data to be
            predicted. Defaults to "data/processed/resmed/score".
        output_dir (str, optional): the directory the predictions should be
            written to. Defaults to "data/scored/resmed".
        preprocessing_config (str, optional): location of preprocessing config
            in mlflow. Defaults to "config/preprocessing_config.json".
        seq_len (int, optional): sequence lengths -> workaround.
            Defaults to 750.
        device (str, optional): device to run inference on.
    """

    logger = logging.getLogger(__name__)

    logger.info(f"Reading file {session}.")
    # clean up output_dir
    session_file = Path(os.path.join(input_dir,
                                     f"{session}_0_HRD.edf.parquet"))

    if not session_file.exists():
        logger.error(f"file {str(session_file)} does not exist.")

    attention_path = Path(os.path.join("reports/figures", run_id, "attention"))
    # remove directories if they exist
    if not attention_path.exists():
        attention_path.mkdir(parents=True, exist_ok=True)

    logger.info("Setting up model.")
    mlflow_client = MlflowClient()

    # get processing info
    with tempfile.TemporaryDirectory() as tmp_dir:
        with open(
                mlflow_client.download_artifacts(
                    run_id=run_id,
                    path="config/preprocessing_config.json",
                    dst_path=tmp_dir), "r") as f:
            preprocessing_config = json.load(f)

    # load model
    smavra = mlflow.pytorch.load_model('runs:/' + run_id + '/model',
                                       map_location="cuda:0")

    smavra.eval()
    if device == "cuda":
        smavra.cuda()
    else:
        smavra.cpu()

    logger.info("Getting attention.")

    pred_df = pq.read_table(session_file).to_pandas()

    pred_tensor = torch.Tensor(pred_df.iloc[:, :3].values)
    pred_tensor = reshape_resmed_tensor(pred_tensor, seq_len)

    dataset = ResmedDatasetEpoch(
        data=pred_tensor,
        batch_size=1,
        device=device,
        means=torch.Tensor(preprocessing_config["means"]),
        stds=torch.Tensor(preprocessing_config["stds"]))

    epoch = dataset[epoch_nr].unsqueeze(0).to(device)

    # get prediction
    h_t, latent, attention_weight, attention, lengths = \
        smavra.encode(epoch)

    _, n_heads, _, _ = attention_weight.shape
    attention_weights = {
        j: attention_weight[0, j, :, :].cpu().detach().numpy()
        for j in range(n_heads)
    }

    return attention_weights
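
A hypothetical usage sketch (run id, session, and epoch index are placeholders), visualising one attention head with plotly as imported in the earlier example:

attention_weights = epoch_attention(
    run_id="<mlflow_run_id>",
    session="<session_name>",
    epoch_nr=0,
)
fig = go.Figure(go.Heatmap(z=attention_weights[0]))  # head 0
fig.show()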
Example 10
def predict_smavra(
        run_id: str,
        input_dir: str = "data/processed/resmed/score",
        output_dir: str = "data/scored/resmed",
        preprocessing_config: str = "config/preprocessing_config.json",
        seq_len: int = 750,
        device: str = "cuda",
        explain_latent: bool = True,
        explain_attention: bool = False,
        score_file_pattern: str = "*"):
    """predict using trained smavra network

    Args:
        run_id (str): run_id from mlflow experiment
        input_dir (str, optional): input directory holding the data to be
            predicted. Defaults to "data/processed/resmed/score".
        output_dir (str, optional): the directory the predictions should be
            written to. Defaults to "data/scored/resmed".
        preprocessing_config (str, optional): location of preprocessing config
            in mlflow. Defaults to "config/preprocessing_config.json".
        seq_len (int, optional): sequence lengths -> workaround.
            Defaults to 750.
        device (str, optional): device to run inference on.
    """
    column_order = ["mask_press", "resp_flow", "delivered_volum"]

    logger = logging.getLogger(__name__)

    logger.info("Preparing directory structure.")
    # clean up output_dir
    output_score_dir = Path(os.path.join(output_dir, "score", run_id))
    output_explain_latent_dir = Path(
        os.path.join(output_dir, "explain", "latent", run_id))
    output_explain_attention_dir = Path(
        os.path.join(output_dir, "explain", "attention", run_id))

    # remove directories if they exist
    if output_score_dir.exists():
        shutil.rmtree(output_score_dir)

    output_score_dir.mkdir(parents=True, exist_ok=True)

    # remove directories if they exist
    if output_explain_latent_dir.exists():
        shutil.rmtree(output_explain_latent_dir)

    output_explain_latent_dir.mkdir(parents=True, exist_ok=True)

    # remove directories if they exist
    if output_explain_attention_dir.exists():
        shutil.rmtree(output_explain_attention_dir)

    output_explain_attention_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Setting up model.")
    mlflow_client = MlflowClient()

    # get processing info
    with tempfile.TemporaryDirectory() as tmp_dir:
        with open(
                mlflow_client.download_artifacts(
                    run_id=run_id,
                    path="config/preprocessing_config.json",
                    dst_path=tmp_dir), "r") as f:
            preprocessing_config = json.load(f)

    # load model
    smavra = mlflow.pytorch.load_model('runs:/' + run_id + '/model',
                                       map_location="cuda:0")

    smavra.eval()
    if device == "cuda":
        smavra.cuda()
    else:
        smavra.cpu()

    logger.info("Start with predicition.")
    for score_file_path in Path(input_dir).glob(score_file_pattern):

        pred_df = pq.read_table(os.path.join(score_file_path)).to_pandas()

        pred_tensor = torch.Tensor(pred_df.iloc[:, :3].values)
        pred_tensor = reshape_resmed_tensor(pred_tensor, seq_len)

        score_dataset = ResmedDatasetEpoch(
            data=pred_tensor,
            batch_size=1,
            device=device,
            means=torch.Tensor(preprocessing_config["means"]),
            stds=torch.Tensor(preprocessing_config["stds"]))
        preds, latents, attention_weights = predict_file(
            model=smavra,
            dataset=score_dataset,
            file_path=score_file_path,
            explain_latent=explain_latent,
            explain_attention=explain_attention,
            seq_len=seq_len,
            column_order=column_order)

        if len(preds) > 1:
            preds = pd.concat(preds, ignore_index=True)
        else:
            preds = preds[0]

        preds = pd.concat([pred_df, preds], axis=1)

        # write predictions
        table = pa.Table.from_pandas(preds)
        file_name = os.path.basename(score_file_path)
        pq.write_table(table, os.path.join(output_score_dir, file_name))

        # EXPLAINABILITY
        # write latent
        if explain_latent:
            latents = np.stack(latents, 0)
            latent_cols = [f"latent_{i}" for i in range(latents.shape[1] - 3)]
            df = pd.DataFrame(latents,
                              columns=latent_cols +
                              ["epoch_loss", "epoch", "epoch_class"])

            df["file_name"] = os.path.basename(score_file_path)[:15]
            table = pa.Table.from_pandas(df)

            file_name = os.path.join(output_explain_latent_dir,
                                     os.path.basename(score_file_path))
            pq.write_table(table, file_name)

        # write attention
        if explain_attention:
            with open(
                    os.path.join(output_explain_attention_dir,
                                 os.path.basename(score_file_path) + ".pkl"),
                    "wb") as f:
                pk.dump(attention_weights, f)
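
The remove-then-recreate pattern repeated three times above could be factored into a small helper; a sketch:

def reset_dir(path: Path) -> None:
    """Remove the directory if it exists, then recreate it empty."""
    if path.exists():
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)

# e.g. reset_dir(output_score_dir) in place of the three inline blocks above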
Example 11
import json
import os

import minio

os.system('nvidia-smi')

minio_client = minio.Minio('minio:9000',
                           access_key='admin',
                           secret_key='password',
                           secure=False)

from mlflow.tracking import MlflowClient

mlflow_client = MlflowClient()

configs_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'configs')

# NOTE: run_id is assumed to be defined earlier in the original script
mlflow_client.download_artifacts(run_id,
                                 'configs',
                                 dst_path=os.path.dirname(
                                     os.path.abspath(__file__)))

print(os.listdir('configs'))

train = None
with open(os.path.join(configs_path, 'train.json'), 'r') as f:
    train = json.load(f)

if train is None:
    exit(-1)

test = None
with open(os.path.join(configs_path, 'test.json'), 'r') as f:
    test = json.load(f)
Example 12
import os
import uuid
import shutil
import pandas as pd

from mlflow.tracking import MlflowClient

# Create temp directory to download input data from MLflow
input_temp_dir = os.path.join(os.environ["SPARK_LOCAL_DIRS"],
                              str(uuid.uuid4())[:8])
os.makedirs(input_temp_dir)

# Download the artifact and read it into a pandas DataFrame
input_client = MlflowClient()
input_data_path = input_client.download_artifacts(
    "0df1633dc8eb4dc0ac25655e9c851687", "data", input_temp_dir)
df_loaded = pd.read_parquet(os.path.join(input_data_path, "training_data"))

# Delete the temp data
shutil.rmtree(input_temp_dir)

# Preview data
df_loaded.head(5)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Preprocessors

# COMMAND ----------
Example 13
class CustomerMlflowClient:
    def __init__(self, tracking_server_uri, experiment_name):
        try:
            self.mlflow_client = MlflowClient(tracking_server_uri)
            logger.info("established mlflow rest-api client")
        except Exception as e:
            logger.error(str(e))

        try:
            self.experiment_id = self.set_experiment(experiment_name)
            logger.info("started mlflow experiment {} with id {}".format(
                experiment_name, self.experiment_id))
        except Exception as e:
            logger.error(str(e))

    def logger(self,
               params,
               metrics,
               local_artifact_path,
               mlflow_artifact_path=None):
        run = self.mlflow_client.create_run(self.experiment_id)
        run_id = run.info.run_id
        logger.info("staring new run with id: {}".format(run_id))
        logger.info("logging parameter to mlflow tracking server")
        self.log_params(run_id, params)
        logger.info("successfully logged parameter to mlflow tracking server")
        logger.info("logging model metrics to mlflow tracking server")
        self.log_metrics(run_id, metrics)
        logger.info(
            "successfully logged model metrics to mlflow tracking server")
        logger.info("logging model artifact to mlflow tracking server")
        self.log_artifact(run_id, local_artifact_path)
        logger.info(
            "successfully logged model artifact to mlflow tracking server")
        logger.info("exiting run with id: {}".format(run_id))

    def set_experiment(self, experiment_name):
        experiment = self.mlflow_client.get_experiment_by_name(experiment_name)
        if experiment is None:
            return self.mlflow_client.create_experiment(experiment_name)
        else:
            return experiment.experiment_id

    def log_params(self, run_id: str, params):
        for key, value in params.items():
            self.mlflow_client.log_param(run_id=run_id, key=key, value=value)

    def log_metrics(self, run_id: str, metrics):
        for key, value in metrics.items():
            self.mlflow_client.log_metric(run_id=run_id, key=key, value=value)

    def log_artifact(self, run_id: str, artifact):
        self.mlflow_client.log_artifact(run_id=run_id, local_path=artifact)

    def get_latest_artifact(self, dest_path):
        run_info = self.mlflow_client.list_run_infos(self.experiment_id)
        latest_run_info = run_info[0]
        file_name = self.mlflow_client.list_artifacts(
            run_id=latest_run_info.run_id)[0].path
        complete_artifact_path = latest_run_info.artifact_uri + '/' + file_name
        self.mlflow_client.download_artifacts(run_id=latest_run_info.run_id,
                                              path=complete_artifact_path,
                                              dst_path=dest_path)
        return dest_path + file_name
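
A hypothetical usage sketch of the wrapper above (tracking URI, experiment name, and paths are placeholders):

client = CustomerMlflowClient("http://localhost:5000", "demo-experiment")
client.logger(params={"lr": 0.01},
              metrics={"rmse": 0.42},
              local_artifact_path="model.pkl")
downloaded_file = client.get_latest_artifact("/tmp/artifacts/")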
Example 14
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': epochs,
            'batch_size': batch_size
        },
    }

    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, 'train.csv'),
                             num_examples=num_examples)
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'validation.csv'))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

    mlflow_uri = f'file://{tmpdir}/mlruns'
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = 'mlflow_test'
    callback = MlflowCallback()

    model = LudwigModel(config, callbacks=[callback])
    model.train(training_set=data_csv,
                validation_set=val_csv,
                test_set=test_csv,
                experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    artifacts = [
        f.path for f in client.list_artifacts(callback.run.info.run_id, "")
    ]
    local_dir = f'{tmpdir}/local_artifacts'
    os.makedirs(local_dir)

    assert 'config.yaml' in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id,
                                                  "config.yaml", local_dir)

    with open(local_config_path, 'r') as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f'runs:/{callback.run.info.run_id}/model'
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert 'ludwig' in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors['ludwig']

    def compare_features(key):
        assert len(model.config[key]) == len(flavor['ludwig_schema'][key])
        for feature, schema_feature in zip(model.config[key],
                                           flavor['ludwig_schema'][key]):
            assert feature['name'] == schema_feature['name']
            assert feature['type'] == schema_feature['type']

    compare_features('input_features')
    compare_features('output_features')

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
Example 15
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": epochs, "batch_size": batch_size},
    }

    data_csv = generate_data(
        input_features, output_features, os.path.join(tmpdir, "train.csv"), num_examples=num_examples
    )
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    mlflow_uri = f"file://{tmpdir}/mlruns"
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = "mlflow_test"
    callback = MlflowCallback()
    wrapped_callback = mock.Mock(wraps=callback)

    model = LudwigModel(config, callbacks=[wrapped_callback], backend=FakeRemoteBackend())
    model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv, experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    run = mlflow.get_run(run_id)
    assert run.info.status == "FINISHED"
    assert wrapped_callback.on_trainer_train_setup.call_count == 1
    assert wrapped_callback.on_trainer_train_teardown.call_count == 1

    artifacts = [f.path for f in client.list_artifacts(callback.run.info.run_id, "")]
    local_dir = f"{tmpdir}/local_artifacts"
    os.makedirs(local_dir)

    assert "config.yaml" in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id, "config.yaml", local_dir)

    with open(local_config_path) as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f"runs:/{callback.run.info.run_id}/model"
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert "ludwig" in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors["ludwig"]

    def compare_features(key):
        assert len(model.config[key]) == len(flavor["ludwig_schema"][key])
        for feature, schema_feature in zip(model.config[key], flavor["ludwig_schema"][key]):
            assert feature["name"] == schema_feature["name"]
            assert feature["type"] == schema_feature["type"]

    compare_features("input_features")
    compare_features("output_features")

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
Example 16
    for key, value in gen_inputs.items():
        log_param(key, value)
    print(output)
    log_artifacts(output.replace("file://", ""))
    print(run)

# if run is None:
#     print("Something wrong at generation")
# for key, value in a.items():
#     log_param(key, value)

## Fetch the artifacts using the run_id obtained earlier

generation_run_id = run.info.run_id
print("generation_run_id=["+generation_run_id+"]")
generation_artifacts_localpath = client.download_artifacts(run_id=generation_run_id, path="")
print("download from Azure worked!!")
print(generation_artifacts_localpath)
#print("generation_artifacts_localpath=["+generation_artifacts_localpath+"]")
# # generation_artifacts_path = _get_or_run("analysis1", {"generation": generation_run.info.run_id, "threshold": threshold, "min_sigma": min_sigma}, git_commit)

#a["artifacts_pathname"] = generation_artifacts_localpath
run = None
with mlflow.start_run(run_name='analysis1') as run:
    run = mlflow.active_run()
    output = kaizu_analysis1(ana1_inputs)
    for key, value in ana1_inputs.items():
        log_param(key, value)
    print(output)
    log_artifacts(output["artifacts"].replace("file://", ""))
    print(run)
Example 17
def download_and_load_models(models, client=None, load=None, use_cached=False):
    """
    Load model from mlflow registry

    :param use_cached: if set, use cached downloaded model
    :param autoload: globally enable/disable model autoloading after download
    :param client: mlflow client instance or None
        model: loaded model
        :models: model descriptors to load from mlflow tracking server
         Await in parameters a list of dict:
           [{
             'uri': 'models:/uri/Stage'
             'use_cached': if set override global use_cached
             'path': '/local/path/to/Save/Downloadto'
             'load' if set: callable to load the model, default to keras
                    set to False to disable this specific model loading
           }]

    return the models dict with injected data:
        loaded: True if loading has been done
        model: model object if loaded
        downloaded: True if downloading has been done
        model_path: path from within the download that was considered as a model
    """
    if client is None:
        client = MlflowClient()
    for m in list(models):
        mdata = models[m]
        uri = mdata["uri"]
        mdata["model_path"] = None
        mdata["model"] = None
        mdata["loaded"] = False
        mdata["downloaded"] = False
        match = MODEL_URI_RE.search(uri)
        gr = match.groupdict()
        version = [
            v for v in client.search_registered_models(
                filter_string=f"name='{gr['model']}'")[0].latest_versions
            if v.current_stage == gr["stage"]
        ][0]
        vmatch = SOURCE_RE.search(version.source)
        vgr = vmatch.groupdict()
        mlflowdir = mdata["path"]
        rm(mlflowdir)
        muse_cached = mdata.get("use_cached", use_cached)
        try:
            if not muse_cached:
                raise ModelNotFound("force download")
            mdata["model_path"] = find_model(mlflowdir)
        except ModelNotFound:
            create_dirs(mlflowdir)
            client.download_artifacts(vgr["runid"], vgr["path"], mlflowdir)
            mdata["model_path"] = find_model(mlflowdir)
            mdata["downloaded"] = True
        aload = mdata.get("load", load)
        if aload is None:
            aload = load_model
        if not aload:
            continue
        mdata["model"] = aload(mdata["model_path"])
        mdata["loaded"] = True
    return models
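
A hypothetical call, following the descriptor structure documented in the docstring (names and paths are placeholders):

models = {
    "classifier": {
        "uri": "models:/classifier/Staging",
        "path": "/tmp/models/classifier",
    },
}
models = download_and_load_models(models)
print(models["classifier"]["model_path"], models["classifier"]["downloaded"])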
Example 18
class ModeWrapperApp(Flask):
    def run(self,
            host=None,
            port=None,
            debug=None,
            load_dotenv=True,
            **options):
        """
    Get env variables
    """
        self.model_name = os.environ['MODEL_NAME']
        self.model_version = os.environ['MODEL_VERSION']
        self.run_id = os.environ["MODEL_RUN_ID"]
        self.model_source_uri = os.environ["MODEL_SOURCE"]
        """
    String literals
    """
        # we strip the "-<version>" suffix from the model name since the yaml files don't come with versions
        self.model_yaml_file = self.model_name.strip().replace(
            "-" + self.model_version, "") + ".yaml"
        self.model_yaml_file_caps_case = self.model_name.strip().replace(
            "-" + self.model_version, "").upper() + ".yaml"
        self.model_yaml_file_small_case = self.model_name.strip().replace(
            "-" + self.model_version, "").lower() + ".yaml"
        self.model_y_test_file_name = "y_test.csv"
        """
    # Fetch model from mlflow registry using model source uri e.g. s3://mlflow-experiments/0/cfd8c04976a04d24b9d2ded788903beb/artifacts/model
    """
        self.model = mlflow.pytorch.load_model(model_uri=self.model_source_uri)
        """
    Create mlflow client which is used to download model artifacts
    """
        self.mlflowClient = MlflowClient()
        """
    Download artifacts from mlflow
    """
        local_dir = os.getcwd()
        if not os.path.exists(local_dir):
            os.mkdir(local_dir)
        local_path = self.mlflowClient.download_artifacts(
            self.run_id, "features", local_dir)
        """
    Confirm downloaded files
    """
        print("Artifacts downloaded in: {}".format(local_path))
        print("Artifacts: {}".format(os.listdir(local_path)))
        """
    Load model yaml from mode features directory
    """
        model_yaml_dir_path_caps = local_path + '/' + str(
            self.model_yaml_file_caps_case).strip()
        model_yaml_dir_path_small = local_path + '/' + str(
            self.model_yaml_file_small_case).strip()
        self.model_y_test_file = local_path + '/' + self.model_y_test_file_name
        self.features_local_path = local_path

        if os.path.exists(model_yaml_dir_path_caps):
            expt_conf = du.yaml.load(
                open(model_yaml_dir_path_caps).read().format(
                    DATA_DIR=local_path),
                Loader=du._Loader)
        elif os.path.exists(model_yaml_dir_path_small):
            expt_conf = du.yaml.load(
                open(model_yaml_dir_path_small).read().format(
                    DATA_DIR=local_path),
                Loader=du._Loader)
        """
      Load model data
    """
        preprocessor = StandardScaler()
        train_filter = [
            filter_preprocessor(cols=expt_conf['numerical'],
                                preprocessor=preprocessor,
                                refit=True),
            filter_fillna(fill_value=expt_conf['normal_values'],
                          time_order_col=expt_conf['time_order_col'])
        ]
        transform = ptd.transform_drop_cols(
            cols_to_drop=expt_conf['time_order_col'])

        self.model_dataset = ptd.BaseDataset(
            tgt_file=expt_conf['test']['tgt_file'],
            feat_file=expt_conf['test']['feat_file'],
            idx_col=expt_conf['idx_cols'],
            tgt_col=expt_conf['tgt_col'],
            feat_columns=expt_conf['feat_cols'],
            time_order_col=expt_conf['time_order_col'],
            category_map=expt_conf['category_map'],
            transform=transform,
            filter=train_filter,
        )

        super(ModeWrapperApp, self).run(host=host,
                                        port=port,
                                        debug=debug,
                                        load_dotenv=load_dotenv,
                                        **options)
Example 19
import os

from mlflow.tracking import MlflowClient


def main():
    print(os.environ)
    print('MODEL VERSION: ' + os.environ.get('MODEL_VERSION'))
    client = MlflowClient()
    # NOTE: the original snippet passes MODEL_VERSION where download_artifacts expects a run_id
    client.download_artifacts(os.environ.get('MODEL_VERSION'), "model", '.')
Example 20
import os
import pickle

from flask import Flask

# (imports for ModelMonitoringService / BaselineMetricCollector are elided
# in the original snippet)

run_id = os.getenv("MLFLOW_RUN_ID")

if run_id is None:
    print("Loading model and histogram from local files")
    model_file = "./model.pkl"
    histogram_file = "./histogram.prom"

else:
    print("Loading model and histogram from MLflow")
    from mlflow.tracking import MlflowClient

    client = MlflowClient()
    local_dir = "/tmp/artifact_downloads"
    if not os.path.exists(local_dir):
        os.mkdir(local_dir)
    local_path = client.download_artifacts(run_id, "", local_dir)
    print("Artifacts downloaded in: {}".format(local_path))
    print("Artifacts: {}".format(os.listdir(local_path)))

    model_file = f"{local_path}/model/model.pkl"
    histogram_file = f"{local_path}/histogram.txt"

with open(model_file, "rb") as f:
    model = pickle.load(f)

monitor = ModelMonitoringService(baseline_collector=BaselineMetricCollector(
    path=histogram_file))

app = Flask(__name__)

Example 21
import os
import uuid
import shutil
import pandas as pd

from mlflow.tracking import MlflowClient

# Download input data from mlflow into a pandas DataFrame
# create temp directory to download data
temp_dir = os.path.join(os.environ["SPARK_LOCAL_DIRS"], str(uuid.uuid4())[:8])
os.makedirs(temp_dir)

# download the artifact and read it
client = MlflowClient()
training_data_path = client.download_artifacts(
    "0df1633dc8eb4dc0ac25655e9c851687", "data", temp_dir)
df = pd.read_parquet(os.path.join(training_data_path, "training_data"))

# delete the temp data
shutil.rmtree(temp_dir)

target_col = "tipped"

# COMMAND ----------

# MAGIC %md
# MAGIC ## Profiling Results

# COMMAND ----------

from pandas_profiling import ProfileReport