import os
import tempfile

import pyspark

import kiwi


def etl_data(ratings_csv, max_row_limit):
    with kiwi.start_run() as mlrun:
        tmpdir = tempfile.mkdtemp()
        ratings_parquet_dir = os.path.join(tmpdir, 'ratings-parquet')
        spark = pyspark.sql.SparkSession.builder.getOrCreate()
        print("Converting ratings CSV %s to Parquet %s" % (ratings_csv, ratings_parquet_dir))
        ratings_df = spark.read \
            .option("header", "true") \
            .option("inferSchema", "true") \
            .csv(ratings_csv) \
            .drop("timestamp")  # Drop unused column
        ratings_df.show()
        if max_row_limit != -1:
            ratings_df = ratings_df.limit(max_row_limit)
        ratings_df.write.parquet(ratings_parquet_dir)
        print("Uploading Parquet ratings: %s" % ratings_parquet_dir)
        kiwi.log_artifacts(ratings_parquet_dir, "ratings-parquet-dir")
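# A minimal invocation sketch for etl_data above; the CSV path and the row
# limit are illustrative assumptions, not part of the original snippet.
if __name__ == "__main__":
    import sys
    ratings_csv = sys.argv[1] if len(sys.argv) > 1 else "ratings.csv"
    etl_data(ratings_csv, max_row_limit=10000)  # pass -1 to keep every row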
import filecmp
import os
import tempfile

import kiwi
from kiwi import start_run
# Assumes kiwi mirrors MLflow's module layout for this path helper.
from kiwi.utils.file_utils import local_file_uri_to_path


def test_log_artifact():
    artifact_src_dir = tempfile.mkdtemp()
    # Create artifacts
    _, path0 = tempfile.mkstemp(dir=artifact_src_dir)
    _, path1 = tempfile.mkstemp(dir=artifact_src_dir)
    for i, path in enumerate([path0, path1]):
        with open(path, "w") as handle:
            handle.write("%s" % str(i))
    # Log an artifact, verify it exists in the directory returned by get_artifact_uri
    # after the run finishes
    artifact_parent_dirs = ["some_parent_dir", None]
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = kiwi.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)
            kiwi.log_artifact(path0, parent_dir)
        expected_dir = os.path.join(run_artifact_dir, parent_dir) \
            if parent_dir is not None else run_artifact_dir
        assert os.listdir(expected_dir) == [os.path.basename(path0)]
        # Join with the basename: path0 is absolute, so joining it directly
        # would discard expected_dir and compare the source file with itself.
        logged_artifact_path = os.path.join(expected_dir, os.path.basename(path0))
        assert filecmp.cmp(logged_artifact_path, path0, shallow=False)
    # Log multiple artifacts, verify they exist in the directory returned by get_artifact_uri
    for parent_dir in artifact_parent_dirs:
        with start_run():
            artifact_uri = kiwi.get_artifact_uri()
            run_artifact_dir = local_file_uri_to_path(artifact_uri)
            kiwi.log_artifacts(artifact_src_dir, parent_dir)
        # Check that the logged artifacts match
        expected_artifact_output_dir = os.path.join(run_artifact_dir, parent_dir) \
            if parent_dir is not None else run_artifact_dir
        dir_comparison = filecmp.dircmp(artifact_src_dir, expected_artifact_output_dir)
        assert len(dir_comparison.left_only) == 0
        assert len(dir_comparison.right_only) == 0
        assert len(dir_comparison.diff_files) == 0
        assert len(dir_comparison.funny_files) == 0
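# Layout the assertions above expect, shown with illustrative (assumed) paths:
#
#   kiwi.log_artifact("/tmp/model.txt", "some_parent_dir")
#       -> <artifact_uri>/some_parent_dir/model.txt
#   kiwi.log_artifacts("/tmp/outputs", "some_parent_dir")
#       -> <artifact_uri>/some_parent_dir/<contents of /tmp/outputs>
#
# With parent_dir=None, both land directly under <artifact_uri>.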
    # Tail of the test() function: report aggregate metrics for the epoch and
    # log them to both MLflow and TensorBoard via log_scalar below.
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), test_accuracy))
    step = (epoch + 1) * len(train_loader)
    log_scalar('test_loss', test_loss, step)
    log_scalar('test_accuracy', test_accuracy, step)


def log_scalar(name, value, step):
    """Log a scalar value to both MLflow and TensorBoard"""
    writer.add_scalar(name, value, step)
    kiwi.log_metric(name, value)


with kiwi.start_run():
    # Log our parameters into mlflow
    for key, value in vars(args).items():
        kiwi.log_param(key, value)

    # Create a SummaryWriter to write TensorBoard events locally
    output_dir = tempfile.mkdtemp()
    writer = SummaryWriter(output_dir)
    print("Writing TensorBoard events locally to %s\n" % output_dir)

    # Perform the training
    for epoch in range(1, args.epochs + 1):
        train(epoch)
        test(epoch)

    # Upload the TensorBoard event logs as a run artifact
    print("Uploading TensorBoard events as a run artifact...")
    kiwi.log_artifacts(output_dir, artifact_path="events")
    print("\nLaunch TensorBoard with:\n\ntensorboard --logdir=%s"
          % os.path.join(kiwi.get_artifact_uri(), "events"))
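# A sketch of the train() counterpart assumed by the loop above. model,
# optimizer, train_loader, args.log_interval, and F (torch.nn.functional) are
# assumptions about the surrounding script, shown only to illustrate reusing
# log_scalar for training metrics.
def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # Use a global step so train and test curves share one x-axis
            step = epoch * len(train_loader) + batch_idx
            log_scalar('train_loss', loss.item(), step)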
import os
import random
import shutil
import sys
import tempfile

import kiwi
from kiwi import log_metric, log_param, log_artifacts, get_artifact_uri, active_run, \
    get_tracking_uri, log_artifact

if __name__ == "__main__":
    print("Running {} with tracking URI {}".format(sys.argv[0], get_tracking_uri()))
    log_param("param1", 5)
    log_metric("foo", 5)
    log_metric("foo", 6)
    log_metric("foo", 7)
    log_metric("random_int", random.randint(0, 100))
    run_id = active_run().info.run_id
    # Get run metadata & data from the tracking server
    service = kiwi.tracking.MlflowClient()
    run = service.get_run(run_id)
    print("Metadata & data for run with UUID %s: %s" % (run_id, run))
    local_dir = tempfile.mkdtemp()
    message = "test artifact written during run %s within artifact URI %s\n" \
              % (active_run().info.run_id, get_artifact_uri())
    try:
        file_path = os.path.join(local_dir, "some_output_file.txt")
        with open(file_path, "w") as handle:
            handle.write(message)
        log_artifacts(local_dir, "some_subdir")
        log_artifact(file_path, "another_dir")
    finally:
        shutil.rmtree(local_dir)
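    # Follow-up sketch (not in the original script): list what was just logged.
    # This assumes kiwi's MlflowClient mirrors MLflow's list_artifacts API.
    for info in service.list_artifacts(run_id, "some_subdir"):
        print("Logged artifact: %s (%s bytes)" % (info.path, info.file_size))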
import os
from random import random, randint

from kiwi import log_metric, log_param, log_artifacts

if __name__ == "__main__":
    print("Running mlflow_tracking.py")

    log_param("param1", randint(0, 100))

    log_metric("foo", random())
    log_metric("foo", random() + 1)
    log_metric("foo", random() + 2)

    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    with open("outputs/test.txt", "w") as f:
        f.write("hello world!")

    log_artifacts("outputs")
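# A minimal sketch (not part of the quickstart above): point the logging calls
# at a tracking server instead of the default local ./mlruns directory. The URI
# is an illustrative assumption, and set_tracking_uri is assumed to be exposed
# by kiwi the way MLflow exposes mlflow.set_tracking_uri.
import kiwi

kiwi.set_tracking_uri("http://localhost:5000")  # call this before any log_* calls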