Example #1
 def __init__(self, url, experiment):
     self.url = url
     self.experiment = experiment
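     # Point MLflow at the given tracking server and select (creating if needed) the experiment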
     mlflow.set_tracking_uri(self.url)
     mlflow.set_experiment(self.experiment)
Example #2
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("kyle-test")

df = pd.read_csv("kc_house_data.csv") 

# choose features
features = ["bedrooms","bathrooms","sqft_living","sqft_above","grade",
            "floors","view",'sqft_lot','floors','waterfront','zipcode'] 

# getting those features from the dataframe
x = df[features]
y = df["price"]

# split the data into 70% train / 30% test
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, 
                                                    random_state=3)

# choose model with settings
model = RandomForestRegressor(n_estimators=100) 
model.fit(x_train, y_train)

# define and print
metrics = {"train_score": model.score(x_train, y_train), 
"test_score": model.score(x_test, y_test)}
print(metrics)
Example #3
import mlflow

from cls.rfr_model import RFRModel
from cls.utils import Utils

if __name__ == "__main__":
   # Use sqlite:///mlruns.db as the local store for tracking and registry
   mlflow.set_tracking_uri("sqlite:///mlruns.db")

   # load and print dataset
   csv_path = "data/windfarm_data.csv"
   wind_farm_data = Utils.load_data(csv_path, index_col=0)
   Utils.print_pandas_dataset(wind_farm_data)

   # Get Validation data
   X_train, y_train = Utils.get_training_data(wind_farm_data)
   val_x, val_y = Utils.get_validation_data(wind_farm_data)

   # train, fit and register our model
   params_list = [
      {"n_estimators": 100},
      {"n_estimators": 200},
      {"n_estimators": 300}]

   # Iterate over few different tuning parameters
   model_name = "SKLearnWeatherForestModel"
   for params in params_list:
      rfr = RFRModel.new_instance(params)
      print("Using paramerts={}".format(params))
      runID = rfr.mlflow_run(X_train, y_train, val_x, val_y, model_name)
      print("MLflow run_id={} completed with MSE={} and RMSE={}".format(runID, rfr.mse, rfr.rsme))
Example #4
def run_experiments(*,
                    data_file_path: str = None,
                    ground_truth_path: str = None,
                    train_size: int,
                    val_size: float = 0.1,
                    sub_test_size: int,
                    channels_idx: int = 0,
                    neighborhood_size: int = None,
                    save_data: bool = False,
                    n_runs: int = 1,
                    dest_path: str,
                    models_path: str,
                    model_name: str,
                    n_classes: int,
                    use_ensemble: bool = False,
                    ensemble_copies: int = None,
                    voting: str = 'mean',
                    voting_model: str = None,
                    voting_model_params: str = None,
                    batch_size: int = 256,
                    noise_params: str = None,
                    endmembers_path: str = None,
                    use_mlflow: bool = False,
                    experiment_name: str = None,
                    model_exp_name: str = None,
                    run_name: str = None):
    """
    Function for running the inference for the unmixing problem
    given a set of hyperparameters.

    :param data_file_path: Path to the data file. It should be a numpy array.
    :param ground_truth_path: Path to the ground-truth data file.
        It should be a numpy array.
    :param train_size: If float, should be between 0.0 and 1.0,
        if int, it represents number of samples to draw from data.
    :param val_size: Should be between 0.0 and 1.0. Represents the
        percentage of samples to extract from the training set.
    :param sub_test_size: Number of pixels to subsample from the test set
        instead of performing inference on the entire set.
    :param channels_idx: Index specifying the channels
        position in the provided data.
    :param neighborhood_size: Size of the spatial patch.
    :param save_data: Boolean indicating whether to save the prepared dataset.
    :param n_runs: Number of total experiment runs.
    :param dest_path: Path to the directory where all experiment runs
        will be saved as subdirectories.
    :param models_path: Path to the directory where the previously trained
        models are stored.
    :param model_name: Name of the model, it serves as a key in the
        dictionary holding all functions returning models.
    :param n_classes: Number of classes.
    :param use_ensemble: Boolean indicating whether to use the
        ensemble functionality for prediction.
    :param ensemble_copies: Number of model copies for the ensemble.
    :param voting: Method of ensemble voting. If 'booster',
        employs a new model, which is trained on the
        ensemble predictions on the training set. Else if 'mean', averages
        the predictions of all models, without any weights.
    :param voting_model: Type of the model to use when the voting
        argument is set to 'booster'. This means that a new model
        is trained on the ensemble's predictions on the training set
        to improve the quality of the regression. Supported models are:
        SVR (support vector machine for regression), RFR (random forest
        for regression) and DTR (decision tree for regression).
    :param voting_model_params: Parameters of the voting model.
        Used only when the type of voting is set to 'booster'.
        Should be specified analogously to the noise injection parameters
        in the 'noise' module.
    :param batch_size: Size of the batch used in the training phase,
        i.e., the number of samples used per gradient step.
    :param noise_params: Parameters for the noise when creating
        copies of the base model. Those can be for instance the mean,
        or standard deviation of the noise.
        For the details see the 'noise' module.
        Exemplary value for this parameter is "{"mean": 0, "std": 1}".
    :param endmembers_path: Path to the endmembers file containing
        the average reflectances for each class. Used only when
        'use_unmixing' is set to True.
    :param use_mlflow: Boolean indicating whether to log metrics
        and artifacts to mlflow.
    :param experiment_name: Name of the experiment. Used only if
        'use_mlflow' is set to True.
    :param model_exp_name: Name of the experiment under which the models
        were originally trained; used to locate their artifacts. Used only if
        'use_mlflow' is set to True.
    :param run_name: Name of the run. Used only if 'use_mlflow' is set to True.
    """
    if use_mlflow:
        args = locals()
        mlflow.set_tracking_uri("http://beetle.mlflow.kplabs.pl")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run(run_name=run_name)
        log_params_to_mlflow(args)
        log_tags_to_mlflow(args['run_name'])
        models_path = get_mlflow_artifacts_path(models_path, model_exp_name)

    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path, 'experiment_' + str(experiment_id))
        model_name_regex = re.compile('unmixing_.*')
        model_dir = os.path.join(models_path, f'experiment_{experiment_id}')
        model_name = list(filter(model_name_regex.match,
                                 os.listdir(model_dir)))[0]
        model_path = os.path.join(model_dir, model_name)

        os.makedirs(experiment_dest_path, exist_ok=True)

        data_source = prepare_data.main(data_file_path=data_file_path,
                                        ground_truth_path=ground_truth_path,
                                        train_size=train_size,
                                        val_size=val_size,
                                        stratified=False,
                                        background_label=-1,
                                        channels_idx=channels_idx,
                                        neighborhood_size=neighborhood_size,
                                        save_data=save_data,
                                        seed=experiment_id,
                                        use_unmixing=True)
        if sub_test_size is not None:
            subsample_test_set(data_source[enums.Dataset.TEST], sub_test_size)
        evaluate_unmixing.evaluate(
            model_path=model_path,
            data=data_source,
            dest_path=experiment_dest_path,
            use_ensemble=use_ensemble,
            ensemble_copies=ensemble_copies,
            endmembers_path=endmembers_path,
            voting=voting,
            voting_model=voting_model,
            noise_params=noise_params,
            batch_size=batch_size,
            seed=experiment_id,
            neighborhood_size=neighborhood_size,
            voting_model_params=voting_model_params)

        tf.keras.backend.clear_session()

    artifacts_reporter.collect_artifacts_report(experiments_path=dest_path,
                                                dest_path=dest_path,
                                                use_mlflow=use_mlflow)

    if use_mlflow:
        mlflow.set_experiment(experiment_name)
        mlflow.log_artifacts(dest_path, artifact_path=dest_path)
        shutil.rmtree(dest_path)
Example #5
import mlflow
import shutil
import inspect
import collections
from conf import Config
from pathlib import Path
from src.data_connectors import PandasFileConnector

mlflow.set_tracking_uri(
    Config.MLFLOW["TRACKING_URI"])  # Setting location to save models
mlflow.set_experiment(Config.MLFLOW["EXPERIMENT_NAME"])


class MLFlowLogger:
    @classmethod
    def log(cls, post_process_output):

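        # Open a fresh MLflow run and log the configuration and optimisation-model artifacts into it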
        with mlflow.start_run():
            cls.__log_config()
            cls.__log_opt_model(post_process_output)

    @classmethod
    def __log_opt_model(cls, post_process_output):

        artifact_folder = Config.MLFLOW['TEMP_ARTIFACT_DIR']
        Path(artifact_folder).mkdir(parents=True, exist_ok=True)

        # Solver Results
        post_process_output.solver_results.results.write(filename=str(
            Path(artifact_folder, 'solver_results.json')),
                                                         format='json')
Example #6
# In[26]:

yPredict = gnb.predict(dfToPredict)
print('The predicted class is: ', yPredict)

# # MLflow integration

# In[27]:

import mlflow
import mlflow.sklearn

# In[28]:

#mlflow.set_experiment(experiment_name='Examen_A57')
mlflow.set_tracking_uri("http://benmassaoud.com:5000")

# In[29]:

with mlflow.start_run():

    mlflow.log_metric("recall_score_test", recall_score_test)
    mlflow.log_metric("f1_score_test", f1_score_test)
    mlflow.log_metric("accuracy_test", accuracy_test)
    mlflow.sklearn.log_model(gnb, "model")

# # Exporting the metrics

# In[32]:

with open("metrics.txt", 'w') as outfile:
Example #7
def prepare_mlflow(params):
    mlflow.set_tracking_uri(params['tracking_uri'])
    mlflow.set_experiment(params['experiment'])
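A call to prepare_mlflow might look like this (the tracking URI and experiment name below are placeholders, not values from the original source):

params = {
    "tracking_uri": "http://localhost:5000",  # placeholder tracking server
    "experiment": "my-experiment",            # placeholder experiment name
}
prepare_mlflow(params)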
Example #8
 def __enter__(self):
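     # On entering the context, configure tracking and start an MLflow run if tracking is enabled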
     if USE_MLFLOW:
         mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URL)
         mlflow.set_experiment(TENANT)
         mlflow.start_run(run_name=RUN_LABEL)
     return self
Example #9
from datetime import date

import mlflow
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

mlflow_settings = dict(
    username="******",
    password="******",
    host="127.0.0.1",
    port=5000,
)

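# Basic-auth credentials are embedded directly in the tracking URI (http://username:password@host:port)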
mlflow.set_tracking_uri(
    "http://{username}:{password}@{host}:{port}".format(**mlflow_settings))

current_date = date.today()
experiment_id = mlflow.set_experiment("Web Traffic Forecast")


def prepare_data(df):
    df["ds"] = pd.to_datetime(df["ds"])
    df['weekday'] = df['ds'].apply(lambda x: x.weekday())
    df['year'] = df.ds.dt.year
    df['month'] = df.ds.dt.month
    df['day'] = df.ds.dt.day

    X = df.set_index("ds").drop(columns=["y"], errors="ignore")

    return X
Example #10
def http_tracking_uri_mock():
    mlflow.set_tracking_uri("http://some-cool-uri")
    yield
    mlflow.set_tracking_uri(None)
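Assuming the generator above is registered as a pytest fixture (the @pytest.fixture decorator is not shown in the snippet), a hypothetical test could request it by name:

def test_tracking_uri_is_mocked(http_tracking_uri_mock):
    assert mlflow.get_tracking_uri() == "http://some-cool-uri"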
Example #11
        return [s]

    def predict(self, context, model_input):
        model_input[['name']] = model_input.apply(self.summarize_article)

        return model_input


# Input and Output formats
input = json.dumps([{'name': 'text', 'type': 'string'}])
output = json.dumps([{'name': 'text', 'type': 'string'}])
# Build the model signature from the spec
signature = ModelSignature.from_dict({'inputs': input, 'outputs': output})

#MLFlow Operations
mlflow.set_tracking_uri("")
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))

# Start tracking
with mlflow.start_run(run_name="hf_summarizer") as run:
    print(run.info.run_id)
    runner = run.info.run_id
    print("mlflow models serve -m runs:/" + run.info.run_id +
          "/model --no-conda")
    mlflow.pyfunc.log_model('model',
                            loader_module=None,
                            data_path=None,
                            code_path=None,
                            conda_env=None,
                            python_model=Summarizer(),
Example #12
 def mlflow_client(self):
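     # Return an MlflowClient bound to the configured tracking URI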
     mlflow.set_tracking_uri(MLFLOW_URI)
     return MlflowClient()
Example #13
if __name__ == '__main__':
    parser = ArgumentParser(description="Training of Sentence VAE")
    parser.add_argument("--config", type=str, required=True, metavar='PATH',
                        help="Path to a configuration file.")
    parser.add_argument("--hyper-parameters", type=str, metavar='PATH',
                        help="Path to a hyper parameters file.")
    parser.add_argument("--run-dir", type=str, required=True, metavar='PATH',
                        help="Path to a directory where model checkpoints will be stored.")
    parser.add_argument("--force", action='store_true',
                        help="Whether to rewrite data if run directory already exists.")
    parser.add_argument("--experiment-name", type=str, metavar="ID",
                        help="Name of experiment if training process is run under mlflow")
    parser.add_argument("--verbose", action='store_true',
                        help="Verbosity of the training script.")
    args = parser.parse_args()

    if args.experiment_name is not None:
        if args.hyper_parameters is None:
            raise ValueError("You should provide hyper-parameters file to log into mlflow.")
        with open(args.hyper_parameters) as fp:
            h_params = json.load(fp)
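        # Use the run directory as a local, file-based tracking store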
        mlflow.set_tracking_uri(args.run_dir)
        mlflow_client = MlflowClient(args.run_dir)
        experiment_id = get_experiment_id(mlflow_client, args.experiment_name)
        tags = get_git_tags(Path.cwd())
        run_experiment(h_params, args.config, mlflow_client, experiment_id, tags=tags, verbose=args.verbose)
    else:
        params = json.loads(evaluate_file(args.config))
        train(args.run_dir, params, args.force, verbose=args.verbose)
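The get_experiment_id helper used above is not part of the snippet; a minimal sketch, assuming it simply reuses an existing experiment or creates one if missing, could look like:

def get_experiment_id(client, experiment_name):
    # Hypothetical helper: return the id of an existing experiment,
    # or create the experiment if it does not exist yet.
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is not None:
        return experiment.experiment_id
    return client.create_experiment(experiment_name)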
Example #14
def test_start_run(monkeypatch):

  _reset_experiment()

  with tempfile.TemporaryDirectory() as tmpdir:

    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    # no run should be active initially
    assert mlf.active_run() is None

    # test default args
    with uv.start_run() as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

    # test explicit experiment name, run name, artifact location
    cfg = {
        'experiment_name': 'experiment_0',
        'run_name': 'bar',
        'artifact_location': '/foo/bar',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert mlf.get_artifact_uri().startswith(cfg['artifact_location'])

    # test env var experiment name, run name, path-based artifact location
    cfg = {
        'MLFLOW_EXPERIMENT_NAME': 'env_foo',
        'MLFLOW_RUN_NAME': 'env_bar',
        'MLFLOW_ARTIFACT_ROOT': '/tmp/foo/bar'
    }

    for k, v in cfg.items():
      monkeypatch.setenv(k, v)

    with uv.start_run() as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

      assert r.data.tags['mlflow.runName'] == cfg['MLFLOW_RUN_NAME']
      assert mlf.get_experiment_by_name(
          cfg['MLFLOW_EXPERIMENT_NAME']) is not None
      assert mlf.get_artifact_uri().startswith(cfg['MLFLOW_ARTIFACT_ROOT'])

    for k, v in cfg.items():
      monkeypatch.delenv(k)

    # test env var tags
    cfg = {
        'tag0': 'foo',
        'tag1': 'bar',
    }

    for k, v in cfg.items():
      monkeypatch.setenv(f'ENVVAR_{k}', v)

    with uv.start_run() as r:
      client = mlf.tracking.MlflowClient()
      tags = client.get_run(r.info.run_id).data.tags
      for k, v in cfg.items():
        assert k in tags, pp.pformat(tags)
        assert tags[k] == v, pp.pformat(tags)

    for k in cfg:
      monkeypatch.delenv(f'ENVVAR_{k}')

    # test CAIP tags
    monkeypatch.setenv('CLOUD_ML_JOB_ID', 'foo_cloud_job')

    with uv.start_run() as r:
      client = mlf.tracking.MlflowClient()
      tags = client.get_run(r.info.run_id).data.tags
      assert 'cloud_ml_job_details' in tags, pp.pformat(tags)
      assert tags['cloud_ml_job_details'] == (
          'https://console.cloud.google.com/ai-platform/jobs/foo_cloud_job')

      assert 'cloud_ml_job_id' in tags, pp.pformat(tags)
      assert tags['cloud_ml_job_id'] == 'foo_cloud_job'

    monkeypatch.delenv('CLOUD_ML_JOB_ID')

    # test case where no gcp project is set with gcs artifact store
    def mock_default(scopes=None, request=None, quota_project_id=None):
      return (google.auth.credentials.AnonymousCredentials(), None)

    monkeypatch.setattr('google.auth.default', mock_default)

    cfg = {
        'experiment_name': 'experiment_1',
        'run_name': 'bar',
        'artifact_location': 'gs://foo/bar',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert mlf.get_artifact_uri().startswith(
          cfg['artifact_location']), mlf.get_artifact_uri()
      assert os.environ.get('GOOGLE_CLOUD_PROJECT') is not None

    # test case where gcp project is set with gcs artifact storage
    def mock_default(scopes=None, request=None, quota_project_id=None):
      return (google.auth.credentials.AnonymousCredentials(), 'test_project')

    monkeypatch.setattr('google.auth.default', mock_default)

    cfg = {
        'experiment_name': 'experiment_2',
        'run_name': 'bar',
        'artifact_location': 'gs://foo/bar',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert mlf.get_artifact_uri().startswith(
          cfg['artifact_location']), mlf.get_artifact_uri()

    # test using existing experiment with different artifact location
    #   - this should use original artifact location
    cfg = {
        'experiment_name': 'experiment_2',
        'run_name': 'bar2',
        'artifact_location': '/a/b/c',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert not mlf.get_artifact_uri().startswith(
          cfg['artifact_location']), mlf.get_artifact_uri()
Example #15
def main(args):
    def do_eda(args):
        show_ner_datainfo(ner_labels, train_data_generator, args.train_file,
                          test_data_generator, args.test_file)

    def do_submit(args):
        generate_submission(args)

    if args.do_eda:
        do_eda(args)

    elif args.do_submit:
        do_submit(args)

    elif args.to_train_poplar:
        from theta.modeling import to_train_poplar
        to_train_poplar(args,
                        train_data_generator,
                        ner_labels=ner_labels,
                        ner_connections=[],
                        start_page=args.start_page,
                        max_pages=args.max_pages)

    elif args.to_reviews_poplar:
        from theta.modeling import to_reviews_poplar
        to_reviews_poplar(args,
                          ner_labels=ner_labels,
                          ner_connections=[],
                          start_page=args.start_page,
                          max_pages=args.max_pages)
    else:
        # -------------------- Model --------------------
        if args.ner_type == 'span':
            from theta.modeling.ner_span import NerTrainer
        else:
            from theta.modeling.ner import NerTrainer

        class AppTrainer(NerTrainer):
            def __init__(self, args, ner_labels):
                super(AppTrainer, self).__init__(args,
                                                 ner_labels,
                                                 build_model=None)

            #  def on_predict_end(self, args, test_dataset):
            #      super(Trainer, self).on_predict_end(args, test_dataset)

        trainer = AppTrainer(args, ner_labels)

        def do_train(args):
            train_examples, val_examples = load_train_val_examples(args)
            trainer.train(args, train_examples, val_examples)

        def do_eval(args):
            args.model_path = args.best_model_path
            _, eval_examples = load_train_val_examples(args)
            model = load_model(args)
            trainer.evaluate(args, model, eval_examples)

        def do_predict(args):
            args.model_path = args.best_model_path
            test_examples = load_test_examples(args)
            model = load_model(args)
            trainer.predict(args, model, test_examples)
            reviews_file, category_mentions_file = save_ner_preds(
                args, trainer.pred_results, test_examples)
            return reviews_file, category_mentions_file

        if args.do_train:
            do_train(args)

        elif args.do_eval:
            do_eval(args)

        elif args.do_predict:
            do_predict(args)

        elif args.do_experiment:
            if args.tracking_uri:
                mlflow.set_tracking_uri(args.tracking_uri)
            mlflow.set_experiment(args.experiment_name)

            with mlflow.start_run(run_name=f"{args.local_id}") as mlrun:
                log_global_params(args, experiment_params)

                # ----- Train -----
                do_train(args)

                # ----- Predict -----
                do_predict(args)

                # ----- Submit -----
                do_submit(args)
Example #16
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlflow
import mlflow.sklearn


def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


if __name__ == "__main__":
    mlflow.set_tracking_uri("http://mlflow.bayescluster.com")
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
    wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "wine-quality.csv")
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
Example #17
import mlflow
from pathlib import Path
from azureml.core import Workspace

# get workspace
ws = Workspace.from_config()

# get root of git repo
prefix = Path(__file__).parent.parent.parent.absolute()

# project settings
project_uri = prefix.joinpath("mlprojects", "sklearn-diabetes")

# azure ml settings
experiment_name = "sklearn-diabetes-mlproject-example"
compute_name = "cpu-cluster"

# setup mlflow tracking
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)

# setup backend config
backend_config = {"COMPUTE": compute_name}

# run mlflow project
run = mlflow.projects.run(
    uri=str(project_uri),
    parameters={"alpha": 0.3},
    backend="azureml",
    backend_config=backend_config,
)
Example #18

def format_layers(config):
    formatted_input_layer = [format_layer("input", config['input'])]
    formatted_hidden_layers = reduce(format_hidden_layer, config['stacks'], {
        'index': 1,
        'stacks': []
    })
    formatted_output_layer = [format_layer("output", config['output'])]
    return formatted_input_layer + formatted_hidden_layers[
        'stacks'] + formatted_output_layer


if __name__ == '__main__':
    LOG.info('Start connecting to mlFlow instance')
    mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
    mlflow.set_experiment(EXPERIMENT_NAME)
    mlflow.start_run(run_name=JOB)
    LOG.info('Done connecting to mlFlow instance')

    try:
        LOG.info('Start loading datasets')

        LOG.info('Start downloading datasets')
        datalake = os.environ['DATALAKE'].replace('s3://', '')
        mlflow.log_param('input', os.environ['DATALAKE'] + '/pinkman/')
        datasets.download_s3_folder(datalake, 'pinkman/dictionary.csv',
                                    'dictionary')
        datasets.download_s3_folder(datalake, 'pinkman/test.csv', './test')
        datasets.download_s3_folder(datalake, 'pinkman/train.csv', './train')
        LOG.info('Done downloading datasets')
Example #19
import os
import json
import amphora_client
import mlflow
import time
from datetime import datetime

from src.mapping import water_save, water_load
from src.sites import site_info
from src.signals import signals
from src.upload_signals import create_or_update_amphorae, upload_signals_to_amphora

## Set up log metrics
start = time.time()
sep = '_'
mlflow.set_tracking_uri(
    "http://aci-mlflow-dns.australiaeast.azurecontainer.io:5000/")
runName = sep.join(['Job_at', str(datetime.utcnow())])
mlflow.start_run(experiment_id=1, run_name=runName)
mlflow.log_metric("time_to_complete", 0)
mlflow.log_metric("sites_analysed", 0)
mlflow.log_metric("run_complete", 0)

sites = site_info()
water_locations = dict()
location_infos = dict()
# check we have all the amphora we need
for key, value in sites.items():
    store = dict()
    location_info = dict()

    code = key
Example #20
def search_plasticc(sim_sn_path, training_cosmos_path, test_cosmos_path,
                    model_dir, batch_size, optimizer, adabound_gamma,
                    adabound_final_lr, lr, seed, epochs, patience, n_trials,
                    norm, flux_err, input1, input2, mixup, threads,
                    eval_frequency, binary,
                    mixup_alpha, mixup_beta):
    storage = 'sqlite:///{}/example.db'.format(model_dir)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    if platform.system() == 'Windows':
        tmp = (Path(__file__).parents[1] / 'mlruns' /
               'search-plasticc-classification' / 'mlruns')
        uri = str(tmp.absolute().as_uri())
        # uri = 'file://' + str(tmp.absolute())
    else:
        tmp = (Path(__file__).parents[1] / 'mlruns' /
               'search-plasticc-classification' / 'mlruns')
        uri = str(tmp.absolute().as_uri())
    mlflow.set_tracking_uri(uri)

    n_classes = 2 if binary else 3
    name = '{n_classes}-{input1}-{input2}'.format(
        n_classes=n_classes, input1=input1, input2=input2
    )
    mlflow.set_experiment(name)

    db_path = os.path.join(model_dir, 'example.db')
    sampler = MyTPESampler()
    if os.path.exists(db_path):
        study = optuna.Study(study_name='study190513', storage=storage,
                             sampler=sampler)
    else:
        study = optuna.create_study(study_name='study190513', storage=storage,
                                    sampler=sampler)

    input_setting = InputSetting(
        batch_size=batch_size, mixup=mixup, mixup_alpha=mixup_alpha,
        mixup_beta=mixup_beta, balance=False
    )
    input_data = InputData(
        training_data=None, validation_data=None, test_data=None,
        mean=None, std=None, input1=input1, input2=input2, remove_y=False,
        is_hsc=False, n_classes=n_classes, input_setting=input_setting
    )

    optimizer_setting = OptimizerSetting(
        name=optimizer, lr=lr, gamma=adabound_gamma,
        final_lr=adabound_final_lr
    )
    loop_setting = LoopSetting(epochs=epochs, patience=patience,
                               eval_frequency=eval_frequency,
                               end_by_epochs=False)

    print('loading data')
    # 1 means the previous flux_err, 2 means the new flux_err
    sim_sn, training_cosmos, _ = load_plasticc_data(
        sim_sn_path=sim_sn_path, training_cosmos_path=training_cosmos_path,
        test_cosmos_path=test_cosmos_path, use_flux_err2=flux_err == 2
    )
    sim_sn = sklearn.utils.shuffle(sim_sn, random_state=seed)
    training_cosmos = sklearn.utils.shuffle(training_cosmos,
                                            random_state=seed + 1)
    for data in (sim_sn, training_cosmos):
        for key in ('flux', 'flux_err'):
            tmp = data[key]
            data[key][np.isnan(tmp)] = 0

    # Convert the class labels to integers
    label_map = get_label_map(binary=binary)
    sim_sn_y = np.array([label_map[c] for c in sim_sn['sn_type']])
    training_cosmos_y = np.array([label_map[c]
                                  for c in training_cosmos['sn_type']])

    sim_x1, sim_x2, sim_y1, sim_y2 = train_test_split(
        sim_sn, sim_sn_y, test_size=0.3, random_state=42, stratify=sim_sn_y
    )
    cosmos_x1, cosmos_x2, cosmos_y1, cosmos_y2 = train_test_split(
        training_cosmos, training_cosmos_y, test_size=0.3, random_state=43,
        stratify=training_cosmos_y
    )

    sim_dev_x, sim_val_x, sim_dev_y, sim_val_y = train_test_split(
        sim_x1, sim_y1, test_size=0.3, random_state=44, stratify=sim_y1
    )
    cosmos_dev_x, cosmos_val_x, cosmos_dev_y, cosmos_val_y = train_test_split(
        cosmos_x1, cosmos_y1, test_size=0.3, random_state=45,
        stratify=cosmos_y1
    )

    weight = np.asarray([0.9 / len(sim_dev_y)] * len(sim_dev_y) +
                        [0.1 / len(cosmos_dev_y)] * len(cosmos_dev_y))
    training_data = Data(x=np.hstack([sim_dev_x, cosmos_dev_x]),
                         y=np.hstack([sim_dev_y, cosmos_dev_y]),
                         weight=weight)
    validation_data = Data(x=np.hstack([sim_val_x, cosmos_val_x]),
                           y=np.hstack([sim_val_y, cosmos_val_y]))
    test_data = Data(x=np.hstack([sim_x2, cosmos_x2]),
                     y=np.hstack([sim_y2, cosmos_y2]))
    input_data.training_data = training_data
    input_data.validation_data = validation_data
    input_data.test_data = test_data

    mean, std = compute_moments(
        train_data=training_data.x, input1=input1, input2=input2, norm=norm,
        use_redshift=False, is_hsc=False, threads=threads
    )
    input_data.mean, input_data.std = mean, std

    for i in range(n_trials):
        study.optimize(
            lambda trial: objective_plasticc(
                trial=trial, input_data=input_data,
                optimizer_setting=optimizer_setting, seed=seed,
                loop_setting=loop_setting, normalization=norm,
                threads=threads, binary=binary, sim_sn_path=sim_sn_path,
                training_cosmos_path=training_cosmos_path, flux_err=flux_err
            ),
            n_trials=1
        )

        df = study.trials_dataframe()
        df.to_csv(os.path.join(model_dir, 'result.csv'))
Example #21
import mlflow
from mlflow.tracking._tracking_service import utils
import os

if __name__ == "__main__":

    mlflow.set_tracking_uri('databricks')
    # Note: get_host_creds will be undefined if not logging to a remote tracking server, e.g. if logging to the local filesystem
    host_creds = utils._get_store().get_host_creds()
    token = host_creds.token
    host = host_creds.host
    print(host, token, os.getgid())

Example #22
def search_hsc(sim_sn_path, hsc_path, model_dir, batch_size, optimizer,
               adabound_gamma, adabound_final_lr, lr, seed, epochs, patience,
               n_trials, norm, input1, input2,
               mixup, threads, eval_frequency, binary, task_name, remove_y,
               mixup_alpha, mixup_beta):
    storage = 'sqlite:///{}/example.db'.format(model_dir)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    if platform.system() == 'Windows':
        tmp = (Path(__file__).parents[1] / 'mlruns' /
               'search-hsc-classification' / 'mlruns')
        uri = str(tmp.absolute().as_uri())
        # uri = 'file://' + str(tmp.absolute())
    else:
        tmp = (Path(__file__).absolute().parents[1] / 'mlruns' /
               'search-hsc-classification' / 'mlruns')
        uri = str(tmp.absolute().as_uri())
    mlflow.set_tracking_uri(uri)

    n_classes = 2 if binary else 3
    name = '{n_classes}-{task_name}-{input1}-{input2}'.format(
        n_classes=n_classes, task_name=task_name, input1=input1, input2=input2
    )
    if remove_y:
        name += '-remove-y'
    mlflow.set_experiment(name)

    print(model_dir)
    db_path = os.path.join(model_dir, 'example.db')
    sampler = MyTPESampler()
    if os.path.exists(db_path):
        study = optuna.Study(study_name='study190513', storage=storage,
                             sampler=sampler)
    else:
        study = optuna.create_study(study_name='study190513', storage=storage,
                                    sampler=sampler)

    input_setting = InputSetting(
        batch_size=batch_size, mixup=mixup,
        mixup_alpha=mixup_alpha, mixup_beta=mixup_beta
    )
    input_data = InputData(
        training_data=None, validation_data=None, test_data=None,
        mean=None, std=None, input1=input1, input2=input2,
        remove_y=remove_y, is_hsc=True, n_classes=n_classes,
        input_setting=input_setting
    )

    optimizer_setting = OptimizerSetting(
        name=optimizer, lr=lr, gamma=adabound_gamma,
        final_lr=adabound_final_lr
    )
    loop_setting = LoopSetting(epochs=epochs, patience=patience,
                               eval_frequency=eval_frequency,
                               end_by_epochs=False)
    print('loading data')
    sim_sn, _ = load_hsc_data(
        sim_sn_path=sim_sn_path, hsc_path=hsc_path,
        remove_y=input_data.remove_y
    )
    sim_sn = sklearn.utils.shuffle(sim_sn, random_state=seed)

    # Convert the class labels to integers
    label_map = get_label_map(binary=binary)
    sim_sn_y = np.array([label_map[c] for c in sim_sn['sn_type']])

    sim_x1, sim_x2, sim_y1, sim_y2 = train_test_split(
        sim_sn, sim_sn_y, test_size=0.3, random_state=42, stratify=sim_sn_y
    )
    sim_dev_x, sim_val_x, sim_dev_y, sim_val_y = train_test_split(
        sim_x1, sim_y1, test_size=0.3, random_state=44, stratify=sim_y1
    )

    training_data = Data(x=sim_dev_x, y=sim_dev_y)
    validation_data = Data(x=sim_val_x, y=sim_val_y)
    test_data = Data(x=sim_x2, y=sim_y2)
    input_data.training_data = training_data
    input_data.validation_data = validation_data
    input_data.test_data = test_data

    mean, std = compute_moments(
        train_data=training_data.x, input1=input1, input2=input2, norm=norm,
        use_redshift=False, is_hsc=True, threads=threads
    )
    input_data.mean, input_data.std = mean, std

    for i in range(n_trials):
        study.optimize(
            lambda trial: objective_hsc(
                trial=trial, sim_sn_path=sim_sn_path, hsc_path=hsc_path,
                optimizer_setting=optimizer_setting, seed=seed,
                loop_setting=loop_setting, normalization=norm,
                threads=threads, binary=binary, input_data=input_data
            ),
            n_trials=1
        )

        df = study.trials_dataframe()
        df.to_csv(os.path.join(model_dir, 'result.csv'))
Example #23
def run_experiments(*,
                    data_file_path: str,
                    ground_truth_path: str = None,
                    train_size: ('train_size', multi(min=0)),
                    val_size: float = 0.1,
                    stratified: bool = True,
                    background_label: int = 0,
                    channels_idx: int = 0,
                    n_runs: int,
                    model_name: str,
                    kernel_size: int = 3,
                    n_kernels: int = 16,
                    save_data: bool = False,
                    n_layers: int = 1,
                    dest_path: str = None,
                    sample_size: int,
                    n_classes: int,
                    lr: float = 0.005,
                    batch_size: int = 150,
                    epochs: int = 10,
                    verbose: int = 2,
                    shuffle: bool = True,
                    patience: int = 3,
                    pre_noise: ('pre', multi(min=0)),
                    pre_noise_sets: ('spre', multi(min=0)),
                    post_noise: ('post', multi(min=0)),
                    post_noise_sets: ('spost', multi(min=0)),
                    noise_params: str = None,
                    use_mlflow: bool = False,
                    experiment_name: str = None,
                    run_name: str = None):
    """
    Function for running experiments given a set of hyper parameters.
    :param data_file_path: Path to the data file. Supported types are: .npy
    :param ground_truth_path: Path to the ground-truth data file.
    :param train_size: If float, should be between 0.0 and 1.0,
                        if stratified = True, it represents percentage of each
                        class to be extracted,
                 If float and stratified = False, it represents percentage of the
                    whole dataset to be extracted with samples drawn randomly,
                    regardless of their class.
                 If int and stratified = True, it represents number of samples
                    to be drawn from each class.
                 If int and stratified = False, it represents overall number of
                    samples to be drawn regardless of their class, randomly.
                 Defaults to 0.8
    :param val_size: Should be between 0.0 and 1.0. Represents the percentage of
                     each class from the training set to be extracted as a
                     validation set, defaults to 0.1
    :param stratified: Indicates whether the extracted training set should be
                     stratified, defaults to True
    :param background_label: Label indicating the background in GT file
    :param channels_idx: Index specifying the channels position in the provided
                         data
    :param save_data: Whether to save the prepared dataset
    :param n_runs: Number of total experiment runs.
    :param model_name: Name of the model, it serves as a key in the
        dictionary holding all functions returning models.
    :param kernel_size: Size of each kernel in each layer.
    :param n_kernels: Number of kernels in each layer.
    :param n_layers: Number of layers in the model.
    :param dest_path: Path to where all experiment runs will be saved as
        subfolders in this directory.
    :param sample_size: Size of the input sample.
    :param n_classes: Number of classes.
    :param lr: Learning rate for the model, i.e., regulates the size of the step
        in the gradient descent process.
    :param batch_size: Size of the batch used in the training phase,
        i.e., the number of samples per gradient step.
    :param epochs: Number of epochs for model to train.
    :param verbose: Verbosity mode used in training, (0, 1 or 2).
    :param shuffle: Boolean indicating whether to shuffle the dataset
        each epoch.
    :param patience: Number of epochs without improvement in order to
        stop the training phase.
    :param pre_noise: The list of names of noise injection methods before
        the normalization transformations. Exemplary names are "gaussian"
        or "impulsive".
    :param pre_noise_sets: The list of sets to which the noise will be
        injected. One element can either be "train", "val" or "test".
    :param post_noise: The list of names of noise injection methods after
        the normalization transformations.
    :param post_noise_sets: The list of sets to which the noise will be injected.
    :param noise_params: JSON containing the parameter setting of injection methods.
        Exemplary value for this parameter: "{"mean": 0, "std": 1, "pa": 0.1}".
        This JSON should include all parameters for noise injection
        functions that are specified in pre_noise and post_noise arguments.
        For the accurate description of each parameter, please
        refer to the ml_intuition/data/noise.py module.
    :param use_mlflow: Whether to log metrics and artifacts to mlflow.
    :param experiment_name: Name of the experiment. Used only if
        use_mlflow = True
    :param run_name: Name of the run. Used only if use_mlflow = True.
    """
    train_size = parse_train_size(train_size)
    if use_mlflow:
        args = locals()
        mlflow.set_tracking_uri("http://beetle.mlflow.kplabs.pl")
        mlflow.set_experiment(experiment_name)
        mlflow.start_run(run_name=run_name)
        log_params_to_mlflow(args)
        log_tags_to_mlflow(args['run_name'])

    if dest_path is None:
        dest_path = os.path.join(os.path.curdir, "temp_artifacts")

    for experiment_id in range(n_runs):
        experiment_dest_path = os.path.join(
            dest_path, '{}_{}'.format(enums.Experiment.EXPERIMENT,
                                      str(experiment_id)))
        if save_data:
            data_source = os.path.join(experiment_dest_path, 'data.h5')
        else:
            data_source = None

        os.makedirs(experiment_dest_path, exist_ok=True)
        if data_file_path.endswith('.h5') and ground_truth_path is None:
            data = load_processed_h5(data_file_path=data_file_path)
        else:
            data = prepare_data.main(data_file_path=data_file_path,
                                     ground_truth_path=ground_truth_path,
                                     output_path=data_source,
                                     train_size=train_size,
                                     val_size=val_size,
                                     stratified=stratified,
                                     background_label=background_label,
                                     channels_idx=channels_idx,
                                     save_data=save_data,
                                     seed=experiment_id)
        if not save_data:
            data_source = data

        if len(pre_noise) > 0:
            noise.inject_noise(data_source=data_source,
                               affected_subsets=pre_noise_sets,
                               noise_injectors=pre_noise,
                               noise_params=noise_params)

        train_model.train(model_name=model_name,
                          kernel_size=kernel_size,
                          n_kernels=n_kernels,
                          n_layers=n_layers,
                          dest_path=experiment_dest_path,
                          data=data_source,
                          sample_size=sample_size,
                          n_classes=n_classes,
                          lr=lr,
                          batch_size=batch_size,
                          epochs=epochs,
                          verbose=verbose,
                          shuffle=shuffle,
                          patience=patience,
                          noise=post_noise,
                          noise_sets=pre_noise_sets,
                          noise_params=noise_params)

        evaluate_model.evaluate(model_path=os.path.join(
            experiment_dest_path, model_name),
                                data=data_source,
                                dest_path=experiment_dest_path,
                                n_classes=n_classes,
                                batch_size=batch_size,
                                noise=post_noise,
                                noise_sets=pre_noise_sets,
                                noise_params=noise_params)
        tf.keras.backend.clear_session()

    artifacts_reporter.collect_artifacts_report(experiments_path=dest_path,
                                                dest_path=dest_path,
                                                use_mlflow=use_mlflow)
    if enums.Splits.GRIDS in data_file_path:
        fair_report_path = os.path.join(dest_path,
                                        enums.Experiment.REPORT_FAIR)
        artifacts_reporter.collect_artifacts_report(
            experiments_path=dest_path,
            dest_path=fair_report_path,
            filename=enums.Experiment.INFERENCE_FAIR_METRICS,
            use_mlflow=use_mlflow)

    if use_mlflow:
        mlflow.log_artifacts(dest_path, artifact_path=dest_path)
        shutil.rmtree(dest_path)
Example #24
    def log_cv(self, experiment, name, tracking_uri=None):
        """Logging of cross validation results to mlflow tracking server
        
        Args:
            experiment (str): experiment ID
            name (str): Name of the experiment artifact (prefix)
            tracking_uri (str, optional): URI of the tracking server. 
                                          Defaults to None, which will use remote 
                                          tracking in remote case
        """
        cv_results = self.results.cv_results_
        best = self.results.best_index_

        timestamp = datetime.datetime.now().isoformat().split(".")[0].replace(
            ":", ".")

        num_runs = len(cv_results["rank_test_score"])
        run_name = "run %d (best run of %d):" % (self.results.best_index_,
                                                 num_runs)

        if tracking_uri:
            mlflow.set_tracking_uri(tracking_uri)

        mlflow.set_experiment(experiment)

        with mlflow.start_run(run_name=run_name):  #  as run:

            mlflow.log_param("folds", self.results.cv)

            print("Logging parameters")
            params = list(self.results.param_grid.keys())
            for param in params:
                mlflow.log_param(param, cv_results["param_%s" % param][best])

            print("Logging metrics")
            mlflow.log_metric("mean_test_score",
                              cv_results["mean_test_score"][best])
            mlflow.log_metric("std_test_score",
                              cv_results["std_test_score"][best])

            print("Logging model")
            mlflow.sklearn.log_model(self.results.best_estimator_, "model")

            print("Logging CV results matrix")
            tempdir = tempfile.TemporaryDirectory().name
            os.mkdir(tempdir)
            filename = "%s-%s-cv_results.csv" % (name, timestamp)
            csv = os.path.join(tempdir, filename)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                pd.DataFrame(cv_results).sort_values(
                    by="rank_test_score").to_csv(csv, index=False)

            mlflow.log_artifact(csv, "cv_results")

        client = MlflowClient()
        experiment_id = client.get_experiment_by_name(experiment).experiment_id

        if is_remote():
            if os.environ.get("DBJL_ORG", None) is None:
                display(
                    HTML(
                        "<a href=%s/#mlflow/experiments/%s>Goto experiment</a>"
                        % (os.environ["DBJL_HOST"], experiment_id)))
            else:
                display(
                    HTML(
                        "<a href=%s?o=%s#mlflow/experiments/%s>Goto experiment</a>"
                        % (os.environ["DBJL_HOST"], os.environ["DBJL_ORG"],
                           experiment_id)))
        else:
            display(
                HTML("<a href=%s/#/experiments/%s>Goto experiment</a>" %
                     (tracking_uri, experiment_id)))
Example #25
import os
from random import random, randint

import mlflow
from mlflow import log_metric, log_param, log_artifacts

tracking_uri = 'file:///root/mlflow'
mlflow.set_tracking_uri(tracking_uri)

experiment_name = 'hello_world'
mlflow.set_experiment(experiment_name)

if __name__ == "__main__":
    print("Running mlflow_tracking.py")

    log_param("hyperparam1", randint(0, 100))

    log_metric("accuracy", random())
    log_metric("accuracy", random() + 1)
    log_metric("accuracy", random() + 2)

    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    with open("outputs/model.txt", "w") as f:
        f.write("hello world!")

    log_artifacts("outputs")
Example #26
 def setUp(self):
     TestCaseWithReset.setUp(self)
     TestCaseWithTempDir.setUp(self)
     if "MLFLOW_TRACKING_URI" in os.environ:
         del os.environ["MLFLOW_TRACKING_URI"]
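     # Clear the tracking URI so the test falls back to the default local store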
     mlflow.set_tracking_uri(None)
parser.add_argument("dir", help="Directory")
parser.add_argument("pipeline", help="Pipeline Name")
parser.add_argument("--cluster-id", help="Cluster ID")
parser.add_argument("--new-cluster", help="Create new cluster", action="store_true")
args = parser.parse_args()
print(args)

if not isdir(args.dir):
    print('Please specify existing directory with pipelines! ', args.dir, ' directory does not exist.')
    sys.exit(-100)

if not isdir(args.dir+'/'+args.pipeline):
    print('Please specify existing pipeline name as --pipeline-name ', args.dir+'/'+args.pipeline, ' directory does not exist.')
    sys.exit(-100)

if args.cluster_id and args.new_cluster:
    print('create_cluster parameter is set to True and cluster_id is specified. Exiting...')
    sys.exit(-100)

if not (args.cluster_id) and not (args.new_cluster):
    print('create_cluster parameter is set to False and cluster_id is not specified. Exiting...')
    sys.exit(-100)

import mlflow
mlflow.set_tracking_uri("databricks")

from setuptools import sandbox
sandbox.run_setup('setup.py', ['clean', 'bdist_wheel'])

from databrickslabs_cicdtemplates import cluster_and_libraries
cluster_and_libraries.main(args.dir, args.pipeline, args.cluster_id)
Example #28
import mlflow
import json
import os
from elasticsearch import Elasticsearch
from pipeline.util import get_or_create_experiment_id
from pipeline.util import MlflowReporter
from pyformance.registry import MetricsRegistry
from pyformance.reporters.influx import InfluxReporter

mlflow.set_tracking_uri("http://127.0.0.1:5000")


def fetch_docs(base_path, es_host, es_index, es_query=None, limit=-1):
    exp_name = "fetch_docs"
    exp_path = f"{base_path}/{exp_name}"
    os.makedirs(exp_path, exist_ok=True)

    run = mlflow.start_run(experiment_id=get_or_create_experiment_id(exp_name))
    docs_path = f"{exp_path}/{run.run_info.run_uuid}"

    registry = MetricsRegistry()

    mlflow_reporter = MlflowReporter(registry=registry,
                                     active_run=run,
                                     reporting_interval=10)
    mlflow_reporter.start()

    influx_reporter = InfluxReporter(registry=registry,
                                     reporting_interval=10,
                                     autocreate_database=True)
    influx_reporter.start()
Example #29
def test_mlflow_hook_save_pipeline_ml_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # config_with_base_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(
            project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            log_model_kwargs={
                "artifact_path":
                dummy_pipeline_ml.log_model_kwargs["artifact_path"],
                "conda_env": {
                    "python": "3.7.0",
                    "dependencies": ["kedro==0.16.5"]
                },
            },
            kpm_kwargs={
                "copy_mode": copy_mode,
            },
        )
        mlflow_hook.before_pipeline_run(run_params=dummy_run_params,
                                        pipeline=pipeline_to_run,
                                        catalog=dummy_catalog)
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(run_params=dummy_run_params,
                                       pipeline=pipeline_to_run,
                                       catalog=dummy_catalog)

        mlflow_tracking_uri = (kedro_project_with_mlflow_conf /
                               "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        loaded_model = mlflow.pyfunc.load_model(
            model_uri=f"runs:/{run_id}/model")

        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.
            loaded_catalog._data_sets.items()
        }

        assert actual_copy_mode == expected
Example #30
import mlflow
import logging
from cortex.main import run

from src.models.fix_match.controller import FixMatchController
from src.data.dataset_plugins import SSLDatasetPlugin
from src import MLFLOW_SSL_URI

logger = logging.getLogger('ssl_evaluation')

if __name__ == '__main__':

    # if exp.ARGS
    mlflow.set_tracking_uri(MLFLOW_SSL_URI)
    controller = FixMatchController()

    run(model=controller)