Example #1
def test_run_with_artifact_path(tmpdir):
    artifact_file = tmpdir.join("model.pkl")
    artifact_file.write("Hello world")
    with kiwi.start_run() as run:
        kiwi.log_artifact(artifact_file)
        submitted_run = kiwi.projects.run(
            TEST_PROJECT_DIR,
            entry_point="test_artifact_path",
            parameters={"model": "runs:/%s/model.pkl" % run.info.run_id},
            use_conda=False,
            experiment_id=FileStore.DEFAULT_EXPERIMENT_ID)
        validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
Example #2
    def objective(args):

        # Define model
        model = RankedNetworkCNNModule(args['learning_rate'],
                                       dataset.get_embeddings(),
                                       hidden_dim=args['hidden'],
                                       output_labels=2)

        # Evaluation on held-out test-set
        with torch.no_grad():
            model.eval()
            results = pd.DataFrame(columns=['labels', 'predictions'])
            for batch_idx, batch in enumerate(test_loader):
                y_hat = model(batch['a'], batch['b'])

                results = results.append(pd.DataFrame({
                    'labels': batch['label'].flatten(),
                    'predictions': y_hat.detach().argmax(axis=1)
                }), ignore_index=True)
            results.to_csv()

            # With a nice confusion matrix
            confusion_matrix(y_pred=results['predictions'].values,
                             y_true=results['labels'].values,
                             classes=[0, 1])

            cm = ConfusionMatrix(actual_vector=results['labels'].values,
                                 predict_vector=results['predictions'].values)

            output_test_results = "cm.txt"
            cm.save_stat(output_test_results)

            output_test_predictions_file = "test_predictions.txt"
            np.savetxt(output_test_predictions_file,
                       results['predictions'].values,
                       delimiter=",")

            kiwi.log_metric(key="test_acc", value=cm.Overall_ACC)
            kiwi.log_metric(key="test_f1_micro", value=cm.F1_Micro)
            kiwi.log_metric(key="test_f1_macro", value=cm.F1_Macro)
            kiwi.log_metric(key="test_ci_pm",
                            value=cm.CI95[1] - cm.Overall_ACC)
            kiwi.log_metric(key="test_ci_pm",
                            value=cm.CI95[1] - cm.Overall_ACC)
            kiwi.log_artifact(output_test_predictions_file)
            kiwi.log_artifact(output_test_results + ".pycm")

            return cm.Overall_ACC
Example #3
def test_artifact_can_be_downloaded_from_absolute_uri_successfully(tmpdir):
    artifact_file_name = "artifact.txt"
    artifact_text = "Sample artifact text"
    local_artifact_path = tmpdir.join(artifact_file_name).strpath
    with open(local_artifact_path, "w") as out:
        out.write(artifact_text)

    logged_artifact_path = "artifact"
    with kiwi.start_run():
        kiwi.log_artifact(local_path=local_artifact_path, artifact_path=logged_artifact_path)
        artifact_uri = kiwi.get_artifact_uri(artifact_path=logged_artifact_path)

    downloaded_artifact_path = os.path.join(
        _download_artifact_from_uri(artifact_uri), artifact_file_name)
    assert downloaded_artifact_path != local_artifact_path
    assert downloaded_artifact_path != logged_artifact_path
    with open(downloaded_artifact_path, "r") as f:
        assert f.read() == artifact_text
Example #4
def test_download_artifacts_from_uri():
    with kiwi.start_run() as run:
        with TempDir() as tmp:
            local_path = tmp.path("test")
            with open(local_path, "w") as f:
                f.write("test")
            kiwi.log_artifact(local_path, "test")
    command = ["mlflow", "artifacts", "download", "-u"]
    # Test with run uri
    run_uri = "runs:/{run_id}/test".format(run_id=run.info.run_id)
    actual_uri = posixpath.join(run.info.artifact_uri, "test")
    for uri in (run_uri, actual_uri):
        p = Popen(command + [uri], stdout=PIPE,
                  stderr=STDOUT)
        output = p.stdout.readlines()
        downloaded_file_path = output[-1].strip()
        downloaded_file = os.listdir(downloaded_file_path)[0]
        with open(os.path.join(downloaded_file_path, downloaded_file), "r") as f:
            assert f.read() == "test"
Example #5
def load_raw_data(url):
    with kiwi.start_run() as mlrun:
        local_dir = tempfile.mkdtemp()
        local_filename = os.path.join(local_dir, "ml-20m.zip")
        print("Downloading %s to %s" % (url, local_filename))
        r = requests.get(url, stream=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

        extracted_dir = os.path.join(local_dir, 'ml-20m')
        print("Extracting %s into %s" % (local_filename, extracted_dir))
        with zipfile.ZipFile(local_filename, 'r') as zip_ref:
            zip_ref.extractall(local_dir)

        ratings_file = os.path.join(extracted_dir, 'ratings.csv')

        print("Uploading ratings: %s" % ratings_file)
        kiwi.log_artifact(ratings_file, "ratings-csv-dir")
Example #6
def test_download_artifact_from_absolute_uri_persists_data_to_specified_output_directory(tmpdir):
    artifact_file_name = "artifact.txt"
    artifact_text = "Sample artifact text"
    local_artifact_path = tmpdir.join(artifact_file_name).strpath
    with open(local_artifact_path, "w") as out:
        out.write(artifact_text)

    logged_artifact_subdir = "logged_artifact"
    with kiwi.start_run():
        kiwi.log_artifact(local_path=local_artifact_path, artifact_path=logged_artifact_subdir)
        artifact_uri = kiwi.get_artifact_uri(artifact_path=logged_artifact_subdir)

    artifact_output_path = tmpdir.join("artifact_output").strpath
    os.makedirs(artifact_output_path)
    _download_artifact_from_uri(artifact_uri=artifact_uri, output_path=artifact_output_path)
    assert logged_artifact_subdir in os.listdir(artifact_output_path)
    assert artifact_file_name in os.listdir(
        os.path.join(artifact_output_path, logged_artifact_subdir))
    with open(os.path.join(
            artifact_output_path, logged_artifact_subdir, artifact_file_name), "r") as f:
        assert f.read() == artifact_text
Example #7
def test_log_artifact():
    artifact_src_dir = tempfile.mkdtemp()
    # Create artifacts
    _, path0 = tempfile.mkstemp(dir=artifact_src_dir)
    _, path1 = tempfile.mkstemp(dir=artifact_src_dir)
    for i, path in enumerate([path0, path1]):
        with open(path, "w") as handle:
            handle.write("%s" % str(i))
    # Log an artifact, verify it exists in the directory returned by get_artifact_uri
    # after the run finishes
    artifact_parent_dirs = ["some_parent_dir", None]
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = kiwi.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)
            kiwi.log_artifact(path0, parent_dir)
        expected_dir = os.path.join(run_artifact_dir, parent_dir) \
            if parent_dir is not None else run_artifact_dir
        assert os.listdir(expected_dir) == [os.path.basename(path0)]
        logged_artifact_path = os.path.join(expected_dir, os.path.basename(path0))
        assert filecmp.cmp(logged_artifact_path, path0, shallow=False)
    # Log multiple artifacts, verify they exist in the directory returned by get_artifact_uri
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = kiwi.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)

            kiwi.log_artifacts(artifact_src_dir, parent_dir)
        # Check that the logged artifacts match
        expected_artifact_output_dir = os.path.join(run_artifact_dir, parent_dir) \
            if parent_dir is not None else run_artifact_dir
        dir_comparison = filecmp.dircmp(artifact_src_dir,
                                        expected_artifact_output_dir)
        assert len(dir_comparison.left_only) == 0
        assert len(dir_comparison.right_only) == 0
        assert len(dir_comparison.diff_files) == 0
        assert len(dir_comparison.funny_files) == 0
Example #8
def test_log_artifact_with_dirs(tmpdir):
    # Test log artifact with a directory
    art_dir = tmpdir.mkdir("parent")
    file0 = art_dir.join("file0")
    file0.write("something")
    file1 = art_dir.join("file1")
    file1.write("something")
    sub_dir = art_dir.mkdir("child")
    with start_run():
        artifact_uri = kiwi.get_artifact_uri()
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        kiwi.log_artifact(str(art_dir))
        base = os.path.basename(str(art_dir))
        assert os.listdir(run_artifact_dir) == [base]
        assert set(os.listdir(os.path.join(run_artifact_dir, base))) == \
            {'child', 'file0', 'file1'}
        with open(os.path.join(run_artifact_dir, base, "file0")) as f:
            assert f.read() == "something"
    # Test log artifact with directory and specified parent folder
    art_dir = tmpdir.mkdir("dir")
    with start_run():
        artifact_uri = kiwi.get_artifact_uri()
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        kiwi.log_artifact(str(art_dir), "some_parent")
        assert os.listdir(run_artifact_dir) == [
            os.path.basename("some_parent")
        ]
        assert os.listdir(os.path.join(run_artifact_dir, "some_parent")) == \
            [os.path.basename(str(art_dir))]
    sub_dir = art_dir.mkdir("another_dir")
    with start_run():
        artifact_uri = kiwi.get_artifact_uri()
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        kiwi.log_artifact(str(art_dir), "parent/and_child")
        assert os.listdir(os.path.join(run_artifact_dir, "parent", "and_child")) == \
            [os.path.basename(str(art_dir))]
        assert os.listdir(os.path.join(run_artifact_dir,
                                       "parent", "and_child",
                                       os.path.basename(str(art_dir)))) == \
            [os.path.basename(str(sub_dir))]
Example #9
import os
import random
import shutil
import sys
import tempfile

import kiwi
from kiwi import log_metric, log_param, log_artifacts, get_artifact_uri, active_run,\
    get_tracking_uri, log_artifact

if __name__ == "__main__":
    print("Running {} with tracking URI {}".format(sys.argv[0],
                                                   get_tracking_uri()))
    log_param("param1", 5)
    log_metric("foo", 5)
    log_metric("foo", 6)
    log_metric("foo", 7)
    log_metric("random_int", random.randint(0, 100))
    run_id = active_run().info.run_id
    # Get run metadata & data from the tracking server
    service = kiwi.tracking.MlflowClient()
    run = service.get_run(run_id)
    print("Metadata & data for run with UUID %s: %s" % (run_id, run))
    local_dir = tempfile.mkdtemp()
    message = "test artifact written during run %s within artifact URI %s\n" \
              % (active_run().info.run_id, get_artifact_uri())
    try:
        file_path = os.path.join(local_dir, "some_output_file.txt")
        with open(file_path, "w") as handle:
            handle.write(message)
        log_artifacts(local_dir, "some_subdir")
        log_artifact(file_path, "another_dir")
    finally:
        shutil.rmtree(local_dir)
Example #10
    print("Computing regularization path using the elastic net.")
    alphas_enet, coefs_enet, _ = enet_path(X,
                                           y,
                                           eps=eps,
                                           l1_ratio=l1_ratio,
                                           fit_intercept=False)

    # Display results
    fig = plt.figure(1)
    ax = plt.gca()

    colors = cycle(['b', 'r', 'g', 'c', 'k'])
    neg_log_alphas_enet = -np.log10(alphas_enet)
    for coef_e, c in zip(coefs_enet, colors):
        l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)

    plt.xlabel('-Log(alpha)')
    plt.ylabel('coefficients')
    title = 'ElasticNet Path by alpha for l1_ratio = ' + str(l1_ratio)
    plt.title(title)
    plt.axis('tight')

    # Save figures
    fig.savefig("ElasticNet-paths.png")

    # Close plot
    plt.close(fig)

    # Log artifacts (output files)
    kiwi.log_artifact("ElasticNet-paths.png")
Example #11
def run(training_data, max_runs, batch_size, max_p, epochs, metric, gpy_model,
        gpy_acquisition, initial_design, seed):
    bounds = [
        {
            'name': 'lr',
            'type': 'continuous',
            'domain': (1e-5, 1e-1)
        },
        {
            'name': 'momentum',
            'type': 'continuous',
            'domain': (0.0, 1.0)
        },
    ]
    # tracking client used to look up metrics of the child training runs
    tracking_client = kiwi.tracking.MlflowClient()

    def new_eval(nepochs,
                 experiment_id,
                 null_train_loss,
                 null_valid_loss,
                 null_test_loss,
                 return_all=False):
        """
        Create a new eval function

        :param nepochs: Number of epochs to train the model.
        :experiment_id: Experiment id for the training run
        :valid_null_loss: Loss of a null model on the validation dataset
        :test_null_loss: Loss of a null model on the test dataset.
        :return_test_loss: Return both validation and test loss if set.

        :return: new eval function.
        """
        def eval(params):
            """
            Train Keras model with given parameters by invoking an MLflow run.

            Each evaluation is launched as a nested MLflow run; its metrics are re-logged on the
            parent run, and the best child run is later recovered by searching on the
            mlflow.parentRunId tag.

            :param params: Parameters to the train_keras script we optimize over:
                           learning_rate, momentum
            :return: The metric value evaluated on the validation data.
            """
            lr, momentum = params[0]
            with kiwi.start_run(nested=True) as child_run:
                p = kiwi.projects.run(run_id=child_run.info.run_id,
                                      uri=".",
                                      entry_point="train",
                                      parameters={
                                          "training_data": training_data,
                                          "epochs": str(nepochs),
                                          "learning_rate": str(lr),
                                          "momentum": str(momentum),
                                          "seed": str(seed)
                                      },
                                      experiment_id=experiment_id,
                                      synchronous=False)
                succeeded = p.wait()
            if succeeded:
                training_run = tracking_client.get_run(p.run_id)
                metrics = training_run.data.metrics

                # cap the loss at the loss of the null model
                train_loss = min(null_train_loss,
                                 metrics["train_{}".format(metric)])
                valid_loss = min(null_valid_loss,
                                 metrics["val_{}".format(metric)])
                test_loss = min(null_test_loss,
                                metrics["test_{}".format(metric)])
            else:
                # run failed => return null loss
                tracking_client.set_terminated(p.run_id, "FAILED")
                train_loss = null_train_loss
                valid_loss = null_valid_loss
                test_loss = null_test_loss

            kiwi.log_metrics({
                "train_{}".format(metric): train_loss,
                "val_{}".format(metric): valid_loss,
                "test_{}".format(metric): test_loss
            })

            if return_all:
                return train_loss, valid_loss, test_loss
            else:
                return valid_loss

        return eval

    with kiwi.start_run() as run:
        experiment_id = run.info.experiment_id
        # Evaluate null model first.
        # We use null model (predict everything to the mean) as a reasonable upper bound on loss.
        # We need an upper bound to handle the failed runs (e.g. return NaNs) because GPyOpt can not
        # handle Infs.
        # Always including a null model in our results is also a good ML practice.
        train_null_loss, valid_null_loss, test_null_loss = new_eval(
            0, experiment_id, _inf, _inf, _inf, True)(params=[[0, 0]])
        myProblem = GPyOpt.methods.BayesianOptimization(
            new_eval(epochs, experiment_id, train_null_loss, valid_null_loss,
                     test_null_loss),
            bounds,
            evaluator_type="local_penalization"
            if min(batch_size, max_p) > 1 else "sequential",
            batch_size=batch_size,
            num_cores=max_p,
            model_type=gpy_model,
            acquisition_type=gpy_acquisition,
            initial_design_type=initial_design,
            initial_design_numdata=max_runs >> 2,
            exact_feval=False)
        myProblem.run_optimization(max_runs)
        matplotlib.use('agg')
        plt.switch_backend('agg')
        with TempDir() as tmp:
            acquisition_plot = tmp.path("acquisition_plot.png")
            convergence_plot = tmp.path("convergence_plot.png")
            myProblem.plot_acquisition(filename=acquisition_plot)
            myProblem.plot_convergence(filename=convergence_plot)
            if os.path.exists(convergence_plot):
                kiwi.log_artifact(convergence_plot, "convergence_plot")
            if os.path.exists(acquisition_plot):
                kiwi.log_artifact(acquisition_plot, "acquisition_plot")

        # find the best run, log its metrics as the final metrics of this run.
        client = MlflowClient()
        runs = client.search_runs(
            [experiment_id], "tags.mlflow.parentRunId = '{run_id}' ".format(
                run_id=run.info.run_id))
        best_val_train = _inf
        best_val_valid = _inf
        best_val_test = _inf
        best_run = None
        for r in runs:
            if r.data.metrics["val_rmse"] < best_val_valid:
                best_run = r
                best_val_train = r.data.metrics["train_rmse"]
                best_val_valid = r.data.metrics["val_rmse"]
                best_val_test = r.data.metrics["test_rmse"]
        kiwi.set_tag("best_run", best_run.info.run_id)
        kiwi.log_metrics({
            "train_{}".format(metric): best_val_train,
            "val_{}".format(metric): best_val_valid,
            "test_{}".format(metric): best_val_test
        })
Example #12
def call_tracking_apis():
    kiwi.log_metric("some_key", 3)
    with tempfile.NamedTemporaryFile("w") as temp_file:
        temp_file.write("Temporary content.")
        # flush so the content is on disk before the file is copied as an artifact
        temp_file.flush()
        kiwi.log_artifact(temp_file.name)