def start_experiment(self, engine):
    # Log the initial artifacts in the log directory
    mlflow.log_artifacts(args.logdir, "training")
    mlflow.log_metric("finished", False)
def on_train_end(self, args, state, control, **kwargs):
    if self._initialized and state.is_world_process_zero:
        if self._log_artifacts:
            logger.info("Logging artifacts. This may take time.")
            mlflow.log_artifacts(args.output_dir)
        mlflow.end_run()
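# Usage sketch for the callback above (an assumption, not part of the original
# snippet): in Hugging Face transformers, `_log_artifacts` is typically toggled
# through an environment variable read when the MLflowCallback is set up, so it
# must be set before the Trainer is created. Check the variable name against
# the installed transformers version.
import os

os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "TRUE"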
def log_scalar(name, value, step):
    """Log a scalar value to both MLflow and TensorBoard"""
    writer.add_scalar(name, value, step)
    mlflow.log_metric(name, value)

# Inside test(epoch), per-step metrics go through the helper above:
log_scalar('test_loss', test_loss, step)
log_scalar('test_accuracy', test_accuracy, step)

with mlflow.start_run():
    # Log our parameters into mlflow
    for key, value in vars(args).items():
        mlflow.log_param(key, value)

    # Create a SummaryWriter to write TensorBoard events locally
    output_dir = dirpath = tempfile.mkdtemp()
    writer = SummaryWriter(output_dir)
    print("Writing TensorBoard events locally to %s\n" % output_dir)

    # Perform the training
    for epoch in range(1, args.epochs + 1):
        train(epoch)
        test(epoch)

    # Upload the TensorBoard event logs as a run artifact
    print("Uploading TensorBoard events as a run artifact...")
    mlflow.log_artifacts(output_dir, artifact_path="events")

    print("\nLaunch TensorBoard with:\n\ntensorboard --logdir=%s"
          % os.path.join(mlflow.get_artifact_uri(), "events"))
def run_evaluation(
    *,
    model_hash: str,
    thr: float = 0.5,
    dpath: str = "datasets/clouds/38-Cloud/38-Cloud_test",
    gtpath: str = "datasets/clouds/38-Cloud/38-Cloud_test/Entire_scene_gts",
    vpath: str = "datasets/clouds/38-Cloud/38-Cloud_test/Natural_False_Color",
    rpath: str = "artifacts/",
    vids: ("v", multi(min=1)),
    batch_size: int = 32,
    img_ids: ("iid", multi(min=0)),
    mlflow: bool = False,
    run_name: str = None,
):
    """
    Load model given model hash and get evaluation metrics on 38-Cloud testset.

    :param model_hash: MLFlow hash of the model to load.
    :param thr: threshold to be used during evaluation.
    :param dpath: path to dataset.
    :param gtpath: path to dataset ground truths.
    :param vpath: path to dataset (false color) visualisation images.
    :param rpath: path to directory where results and artifacts should be logged.
    :param vids: tuple of ids of images which should be used to create
        visualisations. If it contains '*', visualisations will be created for
        all images in the dataset.
    :type vids: tuple[str]
    :param batch_size: size of generated batches; only one batch is loaded to
        memory at a time.
    :param img_ids: if given, process only these images.
    :type img_ids: list[int]
    :param mlflow: whether to use MLFlow.
    :param run_name: name of the run.
    """
    snow_imgs = [
        "LC08_L1TP_064015_20160420_20170223_01_T1",
        "LC08_L1TP_035035_20160120_20170224_01_T1",
        "LC08_L1TP_050024_20160520_20170324_01_T1",
    ]
    if img_ids == []:
        img_ids = None
    else:
        snow_imgs = list(set(snow_imgs) & set(img_ids))

    dpath, gtpath, vpath, rpath = make_paths(dpath, gtpath, vpath, rpath)
    rpath = rpath / uuid.uuid4().hex
    print(f"Working dir: {os.getcwd()}, " + f"artifacts dir: {rpath}",
          flush=True)
    mpath = Path(
        f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/"
        # Change init_model to model for old models
        + "artifacts/init_model/data/model.h5"
    )
    # WARNING: If ever upgraded to tf2.x, this way of using metrics will not work
    # because loaded metrics will become MeanMetricWrapper objects in tf2.x and
    # this script isn't prepared for such objects (because MeanMetricWrapper has
    # state, as opposed to present stateless metric functions).
    model = keras.models.load_model(
        mpath,
        custom_objects={
            "jaccard_index_loss": losses.JaccardIndexLoss(),
            "jaccard_index_metric": losses.JaccardIndexMetric(),
            "dice_coeff_metric": losses.DiceCoefMetric(),
            "recall": losses.recall,
            "precision": losses.precision,
            "specificity": losses.specificity,
            # F1 score is needed for old models
            # "f1_score": losses.f1_score,
            "tf": tf,
        },
    )
    model.load_weights(
        f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/"
        + "artifacts/best_weights/best_weights"
    )
    metrics, _ = evaluate_model(
        model=model,
        thr=thr,
        dpath=dpath,
        gtpath=gtpath,
        vpath=vpath,
        rpath=rpath,
        vids=vids,
        batch_size=batch_size,
        img_ids=img_ids,
        mlflow=mlflow,
        run_name=run_name,
    )
    mean_metrics = {}
    mean_metrics_snow = {}
    for key, value in metrics.items():
        mean_metrics[key] = np.mean(list(value.values()))
        mean_metrics_snow[f"snow_{key}"] = np.mean(
            [value[x] for x in snow_imgs])
    print(mean_metrics, mean_metrics_snow)
    if mlflow:
        log_metrics(mean_metrics)
        log_metrics(mean_metrics_snow)
        log_artifacts(rpath)
def log_artifacts(cls, dir_path, artifact_path=None):
    mlflow.log_artifacts(dir_path, artifact_path)
def main():
    with mlflow.start_run(run_name="Gen Expert Data"):
        import argparse
        parser = argparse.ArgumentParser()
        parser.add_argument('expert_policy_file', type=str)
        parser.add_argument('envname', type=str)
        parser.add_argument('--render', type=str2bool,
                            nargs='?', const=True, default=False)
        parser.add_argument("--max_timesteps", type=int)
        parser.add_argument('--num_rollouts', type=int, default=20,
                            help='Number of expert roll outs')
        args = parser.parse_args()

        for k, v in vars(args).items():
            mlflow.log_param(k, v)

        print('loading and building expert policy')
        policy_fn = load_policy.load_policy(args.expert_policy_file)
        print('loaded and built')

        with tf.Session():
            tf_util.initialize()

            returns = []
            observations = []
            actions = []
            steps = []
            for i in range(args.num_rollouts):
                env = gym.make(args.envname)
                if args.render:
                    video_dir = "./videos/{0}/".format(time())
                    env = wrappers.Monitor(env, video_dir, force=True)
                max_steps = args.max_timesteps or env.spec.max_episode_steps
                print("max_steps set to {0}".format(max_steps))
                print('iter', i)
                obs = env.reset()
                done = False
                totalr = 0.
                trial_steps = 0
                while not done:
                    action = policy_fn(obs[None, :])
                    # action = env.action_space.sample()
                    observations.append(obs)
                    actions.append(action)
                    obs, r, done, _ = env.step(action)
                    totalr += r
                    trial_steps += 1
                    if args.render:
                        env.render(mode='rgb_array')
                    if trial_steps % 100 == 0:
                        print("%i/%i" % (trial_steps, max_steps))
                    if trial_steps >= max_steps:
                        print("hit max_steps")
                        break
                returns.append(totalr)
                steps.append(trial_steps)
                env.close()

            if args.render:
                mlflow.log_artifacts(video_dir)
            for s in steps:
                mlflow.log_metric('steps', s)
            for r in returns:
                mlflow.log_metric('returns', r)
            mlflow.log_metric('mean return', np.mean(returns))
            mlflow.log_metric('std of return', np.std(returns))

            expert_data = {
                'observations': np.array(observations),
                'actions': np.array(actions)
            }

            if not os.path.exists('expert_data'):
                os.makedirs('expert_data')
            filename = os.path.join('expert_data', args.envname + '.pkl')
            with open(filename, 'wb') as f:
                pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
            mlflow.log_artifact(filename, artifact_path="expert_data_file")
def train_models(self, args, base_line=True):
    """
    Train the model and log all the MLflow Metrics

    :param args: command line arguments. If no arguments then use default
    :param base_line: Default flag. Create Baseline model
    """
    # Create TensorFlow Session
    sess = tf.InteractiveSession()

    # Configure output_dir
    output_dir = tempfile.mkdtemp()

    # Initialize some classes
    kdata_cls = KIMDB_Data_Utils()
    ktrain_cls = KTrain()
    kplot_cls = KPlot()

    # Get IMDB data
    (train_data, train_labels), (test_data, test_labels) = kdata_cls.fetch_imdb_data()

    # Prepare and vectorize data
    x_train = kdata_cls.prepare_vectorized_sequences(train_data)
    x_test = kdata_cls.prepare_vectorized_sequences(test_data)
    y_train = kdata_cls.prepare_vectorized_labels(train_labels)
    y_test = kdata_cls.prepare_vectorized_labels(test_labels)

    image_dir = ktrain_cls.get_directory_path("images")
    model_dir = ktrain_cls.get_directory_path("models")

    graph_label_loss = 'Baseline Model: Training and Validation Loss'
    graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
    graph_image_loss_png = os.path.join(image_dir, 'baseline_loss.png')
    graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')

    if not base_line:
        graph_label_loss = 'Experimental: Training and Validation Loss'
        graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir, 'experimental_loss.png')
        graph_image_acc_png = os.path.join(image_dir, 'experimental_accuracy.png')

    kmodel = KModel()
    if base_line:
        print("Baseline Model:")
        model = kmodel.build_basic_model()
    else:
        print("Experiment Model:")
        model = kmodel.build_experimental_model(args.hidden_layers, args.output)

    history = ktrain_cls.compile_and_fit_model(model, x_train, y_train,
                                               epochs=args.epochs,
                                               loss=args.loss,
                                               output_dir=output_dir)
    model.summary()

    ktrain_cls.print_metrics(history)

    figure_loss = kplot_cls.plot_loss_graph(history, graph_label_loss)
    figure_loss.savefig(graph_image_loss_png)
    figure_acc = kplot_cls.plot_accuracy_graph(history, graph_label_acc)
    figure_acc.savefig(graph_image_acc_png)

    results = ktrain_cls.evaluate_model(model, x_test, y_test)
    print("Average Probability Results:")
    print(results)
    print()
    print("Predictions Results:")
    predictions = model.predict(x_test)
    print(predictions)

    mlflow_server = args.tracking_server
    # We don't want to force people to have a tracking server running on
    # localhost, as MLflow tracks into the local 'mlruns' directory by default.
    if mlflow_server:
        # Tracking URI
        if not mlflow_server.startswith("http"):
            mlflow_tracking_uri = 'http://' + mlflow_server + ':5000'
        else:
            mlflow_tracking_uri = mlflow_server
        # Set the Tracking URI
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        print("MLflow Tracking URI: %s" % mlflow_tracking_uri)
    else:
        print("MLflow Tracking URI: %s" % "local directory 'mlruns'")

    with mlflow.start_run():
        # Print out the current run_uuid
        run_uuid = mlflow.active_run().info.run_uuid
        print("MLflow Run ID: %s" % run_uuid)

        # Log parameters
        mlflow.log_param("hidden_layers", args.hidden_layers)
        mlflow.log_param("output", args.output)
        mlflow.log_param("epochs", args.epochs)
        mlflow.log_param("loss_function", args.loss)

        # Calculate metrics
        binary_loss = ktrain_cls.get_binary_loss(history)
        binary_acc = ktrain_cls.get_binary_acc(history)
        validation_loss = ktrain_cls.get_validation_loss(history)
        validation_acc = ktrain_cls.get_validation_acc(history)
        average_loss = results[0]
        average_acc = results[1]

        # Log metrics
        mlflow.log_metric("binary_loss", binary_loss)
        mlflow.log_metric("binary_acc", binary_acc)
        mlflow.log_metric("validation_loss", validation_loss)
        mlflow.log_metric("validation_acc", validation_acc)
        mlflow.log_metric("average_loss", average_loss)
        mlflow.log_metric("average_acc", average_acc)

        # Log artifacts
        mlflow.log_artifacts(image_dir, "images")

        # Log model
        mlflow.keras.log_model(model, "models")

        # Save model locally
        pathdir = "keras_models/" + run_uuid
        model_dir = self.get_directory_path(pathdir, False)
        ktrain_cls.keras_save_model(model, model_dir)

        # Write out TensorFlow events as a run artifact
        print("Uploading TensorFlow events as a run artifact.")
        mlflow.log_artifacts(output_dir, artifact_path="events")

    print("loss function used:", args.loss)
            numpy.ones((points.shape[0], 1), dtype=numpy.float64))))
          for t, points in zip(timepoints, samples)]

ret = list(
    scopyon.generate_images(inputs, num_frames=num_frames, config=config,
                            rng=rng, full_output=True))

inputs_ = []
for t, data in inputs:
    inputs_.extend(([t] + list(row) for row in data))
inputs_ = numpy.array(inputs_)
numpy.save(artifacts / f"inputs{i:03d}.npy", inputs_)

numpy.save(artifacts / f"images{i:03d}.npy",
           numpy.array([img.as_array() for img, infodict in ret]))

true_data = []
for t, (_, infodict) in zip(timepoints, ret):
    true_data.extend([t, key] + list(value)
                     for key, value in infodict['true_data'].items())
true_data = numpy.array(true_data)
numpy.save(artifacts / f"true_data{i:03d}.npy", true_data)

# !ls ./artifacts

log_artifacts(artifactsPath)
mlflow.end_run()
def train_models(self, args, base_line=True):
    """
    Train the model and log all the MLflow Metrics

    :param args: command line arguments. If no arguments then use default
    :param base_line: Default flag. Create Baseline model
    """
    # Initialize some classes
    kdata_cls = KIMDB_Data_Utils()
    ktrain_cls = KTrain()
    kplot_cls = KPlot()

    start_time = time()

    # Get IMDB data
    (train_data, train_labels), (test_data, test_labels) = kdata_cls.fetch_imdb_data()

    # Prepare and vectorize data
    x_train = kdata_cls.prepare_vectorized_sequences(train_data)
    x_test = kdata_cls.prepare_vectorized_sequences(test_data)
    y_train = kdata_cls.prepare_vectorized_labels(train_labels)
    y_test = kdata_cls.prepare_vectorized_labels(test_labels)

    image_dir = ktrain_cls.get_directory_path("images")
    model_dir = ktrain_cls.get_directory_path("models")

    graph_label_loss = 'Baseline Model: Training and Validation Loss'
    graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
    graph_image_loss_png = os.path.join(image_dir, 'baseline_loss.png')
    graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')

    if not base_line:
        graph_label_loss = 'Experimental: Training and Validation Loss'
        graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir, 'experimental_loss.png')
        graph_image_acc_png = os.path.join(image_dir, 'experimental_accuracy.png')

    kmodel = KModel()
    if base_line:
        print("Baseline Model:")
        model = kmodel.build_basic_model()
    else:
        print("Experiment Model:")
        model = kmodel.build_experimental_model(args.hidden_layers, args.output)

    history = ktrain_cls.compile_and_fit_model(model, x_train, y_train,
                                               epochs=args.epochs,
                                               loss=args.loss)
    model.summary()

    ktrain_cls.print_metrics(history)

    figure_loss = kplot_cls.plot_loss_graph(history, graph_label_loss)
    figure_loss.savefig(graph_image_loss_png)
    figure_acc = kplot_cls.plot_accuracy_graph(history, graph_label_acc)
    figure_acc.savefig(graph_image_acc_png)

    results = ktrain_cls.evaluate_model(model, x_test, y_test)
    print("Average Probability Results:")
    print(results)
    print()
    print("Predictions Results:")
    predictions = model.predict(x_test)
    print(predictions)

    timed = time() - start_time

    with mlflow.start_run():
        # Log parameters
        mlflow.log_param("hidden_layers", args.hidden_layers)
        mlflow.log_param("output", args.output)
        mlflow.log_param("epochs", args.epochs)
        mlflow.log_param("loss_function", args.loss)

        # Log metrics
        mlflow.log_metric("binary_loss", ktrain_cls.get_binary_loss(history))
        mlflow.log_metric("binary_acc", ktrain_cls.get_binary_acc(history))
        mlflow.log_metric("validation_loss",
                          ktrain_cls.get_validation_loss(history))
        mlflow.log_metric("validation_acc",
                          ktrain_cls.get_validation_acc(history))
        mlflow.log_metric("average_loss", results[0])
        mlflow.log_metric("average_acc", results[1])

        # Log artifacts
        mlflow.log_artifacts(image_dir, "images")

        # Log model
        mlflow.keras.log_model(model, "models")

    print("This model took", timed, "seconds to train and test.")
    print("loss function used:", args.loss)
def train_model(args, base_line=True):
    '''Train model function'''
    graph_label_loss = 'Baseline Model: Training and Validation Loss'
    graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
    graph_image_loss_png = os.path.join(image_dir, 'baseline_loss.png')
    graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')

    if not base_line:
        graph_label_loss = 'Experimental: Training and Validation Loss'
        graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir, 'experimental_loss.png')
        graph_image_acc_png = os.path.join(image_dir, 'experimental_accuracy.png')

    image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        validation_split=validation_split)
    train_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='training')
    validation_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='validation')

    # Create the model
    model = Sequential()
    model.add(Conv2D(args.filters,
                     kernel_size=args.kernel_size,
                     activation='relu',
                     padding='same',
                     input_shape=(img_width, img_height, img_num_channels)))
    model.add(Flatten())
    model.add(Dense(args.output, activation='softmax'))

    # Compile the model
    model.compile(loss=args.loss,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])

    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  validation_data=validation_generator)
    model.summary()

    print_metrics(history)

    figure_loss = plot_loss_graph(history, graph_label_loss)
    figure_loss.savefig(graph_image_loss_png)
    figure_acc = plot_accuracy_graph(history, graph_label_acc)
    figure_acc.savefig(graph_image_acc_png)

    # print('==================================================')
    # predictions = model.predict(TEST_DATA_DIR)
    # print(predictions)
    # print('==================================================')

    # mlflow.set_experiment(args.experiment_name)
    with mlflow.start_run():
        # Print out the current run_uuid
        run_uuid = mlflow.active_run().info.run_uuid
        print("MLflow Run ID: %s" % run_uuid)
        # mlflow.create_experiment("Training CNN Model", artifact_location=None)

        # Log parameters
        mlflow.log_param("Filters", args.filters)
        mlflow.log_param("Kernel Size", args.kernel_size)
        mlflow.log_param("Output", args.output)
        mlflow.log_param("Epochs", args.epochs)
        mlflow.log_param("Loss", args.loss)
        mlflow.log_param("Optimizer", args.optimizer)

        # Calculate metrics
        binary_loss = get_binary_loss(history)
        binary_acc = get_binary_acc(history)
        validation_loss = get_validation_loss(history)
        validation_acc = get_validation_acc(history)

        # Log metrics
        mlflow.log_metric("binary_loss", binary_loss)
        mlflow.log_metric("binary_acc", binary_acc)
        mlflow.log_metric("validation_loss", validation_loss)
        mlflow.log_metric("validation_acc", validation_acc)

        # Log artifacts
        mlflow.log_artifacts(image_dir, "images")

        # Log model
        mlflow.keras.log_model(model, "models")

        # Save model locally
        pathdir = "../data/out/keras_models/" + run_uuid
        # keras_save_model(model, pathdir)

        # Write out TensorFlow events as a run artifact
        print("Uploading TensorFlow events as a run artifact.")
        mlflow.log_artifacts(output_dir, artifact_path="events")

    mlflow.end_run()
def run(self, args: argparse.Namespace) -> None:
    logger.info("Load config from %s", args.config)
    config = load_yaml(minato.cached_path(args.config), args.overrides)

    logger.info("Configuration: %s", str(config))
    builder = ConfigBuilder.build(config)
    model = builder.model
    train_file = args.train or builder.train_file
    validation_file = args.validation or builder.validation_file

    if not train_file:
        raise ConfigurationError("train file is required.")

    logger.info("Start training...")
    logger.info("Training data: %s", str(train_file))
    logger.info("Validation data: %s", str(validation_file))

    params = {
        "command": " ".join(sys.argv),
        "config_file": args.config,
        "train_file": train_file,
        "validation_file": validation_file,
        "serialization_dir": args.serialization_dir,
        "config": config,
    }

    with _mlflow_start_run():
        serialization_dir = args.serialization_dir
        if args.serialization_dir is None and mlflow is None:
            serialization_dir = "./output"

        with create_workdir(
            serialization_dir,
            exist_ok=args.force,
        ) as workdir:
            workdir = workdir.absolute()
            try:
                with open(workdir / "config.yaml", "w") as f:
                    yaml.dump(config, f)
                with open(workdir / "params.json", "w") as f:
                    json.dump(params, f, indent=2)

                if mlflow is not None:
                    logger.info("Log params to mlflow")
                    mlflow.log_params(params)

                metrics = model.train(train_file, validation_file, workdir)

                if mlflow is not None:
                    logger.info("Log metrics to mlflow")
                    mlflow.log_metrics(metrics)

                logger.info("Training completed")
                logger.info("Training metrics: %s",
                            json.dumps(metrics, indent=2))

                with open(workdir / "metrics.json", "w") as metrics_file:
                    json.dump(metrics, metrics_file)
                with open(workdir / "model.pkl", "wb") as model_file:
                    pickle.dump(model, model_file)
            finally:
                if mlflow is not None:
                    logger.info("Log artifacts to mlflow")
                    mlflow.log_artifacts(str(workdir))

    logger.info("Done!")
def main(learning_rate, batch_size, checkpoint_base_path, data_path, tracking_url):
    checkpoint_path = os.path.join(checkpoint_base_path, current_timestamp())
    os.makedirs(checkpoint_path, exist_ok=True)

    params = {
        'hidden_size': 512,
        'keep_rate': 0.8,
        'learning_rate': learning_rate,
        'nb_epochs': 1,
        'batch_size': batch_size,
        'checkpoint_path': checkpoint_path
    }

    # Configure the location where tracking data will be written to. In real life
    # this would be a remote MLflow Tracking Server (using HTTP) or something like
    # S3, HDFS, etc.
    mlflow.set_tracking_uri(tracking_url)

    # Set name of experiment
    mlflow.set_experiment('MNIST_TF_Estimator')

    with mlflow.start_run() as run:
        # Log parameters in MLflow
        for name, value in params.items():
            mlflow.log_param(name, value)

        def train_input_fn():
            ds = dataset.train(data_path)
            ds = ds.shuffle(buffer_size=50000)
            ds = ds.take(5000)  # just to speed up training
            ds = ds.batch(params['batch_size'])
            ds = ds.repeat(params['nb_epochs'])
            return ds

        def eval_input_fn():
            ds = dataset.test(data_path)
            ds = ds.batch(params['batch_size'])
            return ds

        run_config = tf.estimator.RunConfig(log_step_count_steps=50)
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           model_dir=checkpoint_path,
                                           params=params,
                                           config=run_config)
        estimator.train(input_fn=train_input_fn)

        eval_results = estimator.evaluate(input_fn=eval_input_fn)
        tf.logging.info('Eval loss: %s' % eval_results['loss'])
        tf.logging.info('Eval accuracy: %s' % eval_results['accuracy'])

        # Log results in MLflow
        mlflow.log_metric("eval_loss", eval_results['loss'])
        mlflow.log_metric("eval_acc", eval_results['accuracy'])

        # Send checkpoint and event files to MLflow
        mlflow.log_artifacts(checkpoint_path)

        # Export the latest checkpoint as SavedModel
        feat_spec = {
            "images": tf.placeholder("float", name="images", shape=[None, 784])
        }
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            feat_spec)
        export_dir_base = os.path.join(checkpoint_path, 'saved_models')
        saved_estimator_path = estimator.export_saved_model(
            export_dir_base, receiver_fn).decode("utf-8")
        tf.logging.info('SavedModel has been exported to %s'
                        % saved_estimator_path)

        # Log the SavedModel as an MLflow model
        mlflow.tensorflow.log_model(tf_saved_model_dir=saved_estimator_path,
                                    tf_meta_graph_tags=[tag_constants.SERVING],
                                    tf_signature_def_key="serving_default",
                                    artifact_path="exported_model")
#
# Code snippet for https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.log_artifacts
#
import warnings
import os
import json
import mlflow

if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    print(mlflow.__version__)

    # Create some artifact data to preserve
    features = "rooms, zipcode, median_price, school_rating, transport"
    data = {"state": "TX", "Available": 25, "Type": "Detached"}

    # Create a couple of artifact files under the directory "data"
    os.makedirs("data", exist_ok=True)
    with open("data/data.json", 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    with open("data/features.txt", 'w') as f:
        f.write(features)

    # Write all files in "data" to root artifact_uri/states
    with mlflow.start_run():
        mlflow.log_artifacts("data", artifact_path="states")
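# Companion sketch (an assumption layered on the snippet above, not part of it):
# the artifacts logged under "states" can be listed and fetched back through the
# tracking client. `mlflow.last_active_run()` is used here to recover the run ID
# of the run just finished; verify availability in your MLflow version.
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()
run_id = mlflow.last_active_run().info.run_id
for artifact in client.list_artifacts(run_id, "states"):
    print(artifact.path, artifact.file_size)
local_path = client.download_artifacts(run_id, "states")
print("Downloaded to", local_path)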
def log_artifact(source_dir, artifact_path=None):
    print(f"[INFO] Logging artifacts in {source_dir}...")
    # Use the function's own arguments; the original referenced an undefined
    # `local_dir` and always discarded the artifact_path.
    mlflow.log_artifacts(source_dir, artifact_path=artifact_path)
# Log a metric; metrics can be updated throughout the run
mlflow.log_metric("foo", random())
mlflow.log_metric("foo", random() + 1)
mlflow.log_metric("foo", random() + 2)
mlflow.log_metric("foo", random() + 3)

mlflow.log_metric("bar", random())
mlflow.log_metric("bar", random() + 1)
mlflow.log_metric("bar", random() + 2)
mlflow.log_metric("bar", random() + 3)

# Log an artifact (output file)
os.makedirs("outputs", exist_ok=True)
with open("outputs/test.txt", "w") as f:
    f.write("hello world!")
mlflow.log_artifacts("outputs")

mlflow.end_run()

# Step 1: Set your Environment Variables
get_ipython().run_line_magic('env', 'NEPTUNE_API_TOKEN=ANONYMOUS')
get_ipython().run_line_magic('env', 'NEPTUNE_PROJECT=shared/mlflow-integration')

# Step 2: Sync your MLruns with Neptune
get_ipython().system(' neptune mlflow')

# **Note:**
# You can specify the path to the directory where the 'mlruns' directory is.
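# A small sketch of the same metric pattern as the "foo"/"bar" series above,
# but with explicit step indices (nothing assumed beyond the standard mlflow
# API): passing `step` pins each value to a position in the metric history,
# which keeps the series unambiguous in the UI even if calls are reordered.
import mlflow
from random import random

with mlflow.start_run():
    for step in range(4):
        mlflow.log_metric("foo", random() + step, step=step)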
    random_state=123)
rf.fit(train_resampled.drop(outcome, axis=1), train_resampled[outcome])

# evaluate model
train_predictions = rf.predict(train_resampled.drop(outcome, axis=1))
test_predictions = rf.predict(test.drop(outcome, axis=1))
train_auc = roc_auc_score(train_resampled[outcome], train_predictions)
test_auc = roc_auc_score(test[outcome], test_predictions)

# log data
import mlflow
import tempfile

mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('case-study-one')
mlflow.start_run()
mlflow.log_param('n_estimators', n_estimators)
mlflow.log_param('max_features', max_features)
mlflow.log_metric('train_auc', train_auc)
mlflow.log_metric('test_auc', test_auc)
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'train.csv')
    train.to_csv(path)
    mlflow.log_artifacts(tmp)
mlflow.sklearn.log_model(rf, 'model')
mlflow.end_run()
def train_elasticnet(datasets, in_alpha, in_l1_ratio, trial=None, verbose=True):
    from sklearn.linear_model import ElasticNet

    train_x = datasets['train_x']
    train_y = datasets['train_y']
    test_x = datasets['test_x']
    test_y = datasets['test_y']

    alpha, l1_ratio = eval_parameters(in_alpha, in_l1_ratio)
    print("Parameters (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))

    server_uri = "http://localhost:5000"
    # mlflow.set_tracking_uri(server_uri)
    # mlflow.set_experiment("wine6")

    with mlflow.start_run():
        # Train with ElasticNet
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        # Evaluate metrics
        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log parameters, metrics, and model to MLflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        mlflow.set_tag("algo", "ElasticNet")
        if trial is not None:
            mlflow.set_tag("trial", trial)

        # Store info
        if verbose:
            with tempfile.TemporaryDirectory() as tmpdirname:
                output_train_data_summary(tmpdirname, train_x, train_y)
                output_enet_coefs(tmpdirname, train_x.columns, lr)
                # Plots
                plot_enet_feature_importance(tmpdirname, train_x.columns,
                                             lr.coef_)
                # Call plot_enet_descent_path
                # image = plot_enet_descent_path(tmpdirname, train_x, train_y, l1_ratio)

                # Log artifacts (output files)
                mlflow.log_artifacts(tmpdirname, artifact_path="artifacts")

        # Store model
        mlflow.sklearn.log_model(lr, "model")

    return rmse
def log_artifacts(cls, dir_path, artifact_path=None):
    try:
        mlflow.log_artifacts(dir_path, artifact_path)
    except ConnectionError:
        logger.warning("ConnectionError in logging artifacts to MLFlow")
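# A hedged variant of the wrapper above (illustrative, not part of the original
# class): retry transient failures with exponential backoff before giving up,
# since artifact uploads to a remote tracking server can fail intermittently.
# The helper name and retry policy are assumptions.
import logging
import time

import mlflow
from requests.exceptions import ConnectionError

logger = logging.getLogger(__name__)

def log_artifacts_with_retry(dir_path, artifact_path=None, attempts=3):
    """Upload a directory of artifacts, retrying transient connection errors."""
    for attempt in range(attempts):
        try:
            mlflow.log_artifacts(dir_path, artifact_path)
            return
        except ConnectionError:
            if attempt == attempts - 1:
                logger.warning(
                    "Giving up on MLflow artifact upload after %d attempts",
                    attempts)
            else:
                time.sleep(2 ** attempt)  # simple exponential backoff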
def train_and_evaluate(args):
    """Trains and evaluates the Keras model.

    Uses the Keras model defined in model.py and trains on data loaded and
    preprocessed in utils.py. Saves the trained model in TensorFlow SavedModel
    format to the path defined in part by the --job-dir argument.

    History objects return, e.g.:
        {'loss': [0.5699903990809373, 0.3629718415849791],
         'acc': [0.78604823, 0.8331693],
         'val_loss': [0.3966572880744934, 0.3477487564086914],
         'val_acc': [0.8278044, 0.8281116],
         'lr': [0.02, 0.015]}

    Args:
        args: dictionary of arguments - see get_args() for details
    """
    logging.info('Resume training: {}'.format(args.reuse_job_dir))
    if not args.reuse_job_dir:
        if tf.io.gfile.exists(args.job_dir):
            tf.io.gfile.rmtree(args.job_dir)
            logging.info('Deleted job_dir {} to avoid re-use'.format(
                args.job_dir))
    else:
        logging.info('Reusing job_dir {} if it exists'.format(args.job_dir))

    train_x, train_y, eval_x, eval_y = utils.load_data(args.train_files,
                                                       args.eval_files)
    # Dimensions
    num_train_examples, input_dim = train_x.shape
    num_eval_examples = eval_x.shape[0]

    # Create the Keras model
    keras_model = model.create_keras_model(input_dim=input_dim,
                                           learning_rate=args.learning_rate)

    # Pass a numpy array by passing DataFrame.values
    training_dataset = model.input_fn(features=train_x.values,
                                      labels=train_y,
                                      shuffle=True,
                                      num_epochs=args.num_epochs,
                                      batch_size=args.batch_size)

    # Pass a numpy array by passing DataFrame.values
    validation_dataset = model.input_fn(features=eval_x.values,
                                        labels=eval_y,
                                        shuffle=False,
                                        num_epochs=args.num_epochs,
                                        batch_size=num_eval_examples)

    start_time = time()

    # Set MLflow tracking URI
    if args.mlflow_tracking_uri:
        mlflow.set_tracking_uri(args.mlflow_tracking_uri)

    # Train model
    with mlflow.start_run() as active_run:
        run_id = active_run.info.run_id

        # Callbacks
        class MlflowCallback(tf.keras.callbacks.Callback):
            # This function will be called after training completes.
            def on_train_end(self, logs=None):
                mlflow.log_param('num_layers', len(self.model.layers))
                mlflow.log_param('optimizer_name',
                                 type(self.model.optimizer).__name__)

        # MLflow callback
        mlflow_callback = MlflowCallback()

        # Setup Learning Rate decay callback.
        lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(
            lambda epoch: args.learning_rate + 0.02 * (0.5 ** (1 + epoch)),
            verbose=False)

        # Setup TensorBoard callback.
        tensorboard_path = os.path.join(args.job_dir, run_id, 'tensorboard')
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            tensorboard_path, histogram_freq=1)

        history = keras_model.fit(
            training_dataset,
            steps_per_epoch=int(num_train_examples / args.batch_size),
            epochs=args.num_epochs,
            validation_data=validation_dataset,
            validation_steps=args.eval_steps,
            verbose=1,
            callbacks=[lr_decay_callback, tensorboard_callback,
                       mlflow_callback])
        metrics = history.history
        logging.info(metrics)
        keras_model.summary()

        mlflow.log_param('train_files', args.train_files)
        mlflow.log_param('eval_files', args.eval_files)
        mlflow.log_param('num_epochs', args.num_epochs)
        mlflow.log_param('batch_size', args.batch_size)
        mlflow.log_param('learning_rate', args.learning_rate)
        mlflow.log_param('train_samples', num_train_examples)
        mlflow.log_param('eval_samples', num_eval_examples)
        mlflow.log_param('eval_steps', args.eval_steps)
        mlflow.log_param('steps_per_epoch',
                         int(num_train_examples / args.batch_size))

        # Add metrics
        _mlflow_log_metrics(metrics, 'loss')
        _mlflow_log_metrics(metrics, 'acc')
        _mlflow_log_metrics(metrics, 'val_loss')
        _mlflow_log_metrics(metrics, 'val_acc')
        _mlflow_log_metrics(metrics, 'lr')

        # Export SavedModel
        model_local_path = os.path.join(args.job_dir, run_id, 'model')
        tf.keras.experimental.export_saved_model(keras_model, model_local_path)

        # Define artifacts.
        logging.info('Model exported to: {}'.format(model_local_path))

        # MLflow workaround since it is unable to read a GCS path.
        # https://github.com/mlflow/mlflow/issues/1765
        if model_local_path.startswith('gs://'):
            logging.info('Creating temp folder')
            temp = tempfile.mkdtemp()
            model_deployment.copy_artifacts(model_local_path, temp)
            model_local_path = os.path.join(temp, 'model')

        if tensorboard_path.startswith('gs://'):
            logging.info('Creating temp folder')
            temp = tempfile.mkdtemp()
            model_deployment.copy_artifacts(tensorboard_path, temp)
            tensorboard_path = temp

        mlflow.tensorflow.log_model(tf_saved_model_dir=model_local_path,
                                    tf_meta_graph_tags=[tag_constants.SERVING],
                                    tf_signature_def_key='serving_default',
                                    artifact_path='model')

        # Reloading the model
        pyfunc_model = mlflow.pyfunc.load_model(
            mlflow.get_artifact_uri('model'))

        logging.info('Uploading TensorFlow events as a run artifact.')
        mlflow.log_artifacts(tensorboard_path)
        logging.info('Launch TensorBoard with:\n\ntensorboard --logdir=%s'
                     % tensorboard_path)

        duration = time() - start_time
        mlflow.log_metric('duration', duration)
        mlflow.end_run()

        if model_local_path.startswith('gs://') and \
                tensorboard_path.startswith('gs://'):
            shutil.rmtree(model_local_path)
            shutil.rmtree(tensorboard_path)

    # Deploy model to AI Platform.
    if args.deploy_gcp:
        # Create AI Platform helper instance.
        if not args.project_id:
            raise ValueError('No Project is defined')
        if not args.gcs_bucket:
            raise ValueError('No GCS bucket')
        model_helper = model_deployment.AIPlatformModel(
            project_id=args.project_id)

        # Copy local model to GCS for deployment.
        if not model_local_path.startswith('gs://'):
            model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id,
                                          'model')
            model_deployment.copy_artifacts(model_local_path, model_gcs_path)

        # Create model
        model_helper.create_model(args.model_name)

        # Create model version
        model_helper.deploy_model(model_gcs_path, args.model_name, run_id,
                                  args.run_time_version)
        logging.info('Model deployment in GCP completed')

    logging.info(
        'This model took: {} seconds to train and test.'.format(duration))
def run_evaluation(
    *,
    model_hash: str,
    thr: float = 0.5,
    dpath: str = "datasets/clouds/Landsat-Cloud-Cover-Assessment-"
                 + "Validation-Data-Partial",
    rpath: str = "artifacts/",
    vids: ("v", multi(min=1)),
    batch_size: int = 32,
    bands: ("b", multi(min=1)),
    bands_names: ("bn", multi(min=1)),
    img_ids: ("iid", multi(min=0)),
    resize: bool = False,
    normalize: bool = False,
    standardize: bool = False,
    mlflow: bool = False,
    run_name: str = None,
):
    """
    Load model given model hash and get evaluation metrics on L8CCA testset.

    :param model_hash: MLFlow hash of the model to load.
    :param thr: threshold to be used during evaluation.
    :param dpath: path to dataset.
    :param rpath: path to directory where results and artifacts should be logged.
    :param vids: tuple of ids of images which should be used to create
        visualisations. If it contains '*', visualisations will be created for
        all images in the dataset.
    :type vids: tuple[str]
    :param batch_size: size of generated batches; only one batch is loaded to
        memory at a time.
    :param bands: band numbers to load.
    :type bands: list[int]
    :param bands_names: names of the bands to load. Should have the same number
        of elements as bands.
    :type bands_names: list[str]
    :param img_ids: if given, process only these images.
    :type img_ids: list[int]
    :param resize: whether to resize loaded img to gt.
    :param normalize: whether to normalize the image.
    :param standardize: whether to standardize the image.
    :param mlflow: whether to use MLFlow.
    :param run_name: name of the run.
    """
    snow_imgs = ["LC82271192014287LGN00", "LC81321192014054LGN00"]
    if img_ids == []:
        img_ids = None
    else:
        snow_imgs = list(set(snow_imgs) & set(img_ids))

    dpath, rpath = make_paths(dpath, rpath)
    rpath = rpath / uuid.uuid4().hex
    print(f"Working dir: {os.getcwd()}, " + f"artifacts dir: {rpath}",
          flush=True)
    mpath = Path(
        f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/"
        # Change init_model to model for old models
        + "artifacts/init_model/data/model.h5")
    # WARNING: If ever upgraded to tf2.x, this way of using metrics will not work
    # because loaded metrics will become MeanMetricWrapper objects in tf2.x and
    # this script isn't prepared for such objects (because MeanMetricWrapper has
    # state, as opposed to present stateless metric functions).
    model = keras.models.load_model(
        mpath,
        custom_objects={
            "jaccard_index_loss": losses.JaccardIndexLoss(),
            "jaccard_index_metric": losses.JaccardIndexMetric(),
            "dice_coeff_metric": losses.DiceCoefMetric(),
            "recall": losses.recall,
            "precision": losses.precision,
            "specificity": losses.specificity,
            # F1 score is needed for old models
            # "f1_score": losses.f1_score,
            "tf": tf,
        },
    )
    model.load_weights(
        f"/media/ML/mlflow/beetle/artifacts/34/{model_hash}/"
        + "artifacts/best_weights/best_weights")
    metrics, _ = evaluate_model(
        model=model,
        thr=thr,
        dpath=dpath,
        rpath=rpath,
        vids=vids,
        batch_size=batch_size,
        bands=bands,
        bands_names=bands_names,
        img_ids=img_ids,
        resize=resize,
        normalize=normalize,
        standardize=standardize,
        mlflow=mlflow,
        run_name=run_name,
    )
    mean_metrics = {}
    mean_metrics_snow = {}
    for key, value in metrics.items():
        mean_metrics[key] = np.mean(list(value.values()))
        mean_metrics_snow[f"snow_{key}"] = np.mean(
            [value[x] for x in snow_imgs])
    print(mean_metrics, mean_metrics_snow)
    if mlflow:
        log_metrics(mean_metrics)
        log_metrics(mean_metrics_snow)
        log_artifacts(rpath)
if gen_method in ratio_estimator_methods:
    llr_means = np.mean(llr, axis=0)
    mlflow.log_metrics({
        "theta 0 LLR": llr_means[0],
        "theta 1 LLR": llr_means[1],
    })
elif gen_method in score_estimator_methods:
    score_means = np.mean(scores, axis=0)
    mlflow.log_metrics({
        "theta 0 score": score_means[0],
        "theta 1 score": score_means[1],
    })

mlflow.log_artifacts(f"{results_dir}/{gen_method}")


#################################
## Calculating expected limits ##
#################################

for flag in include_xsec:
    _, p_values, best_fit_index, _, _, _ = limits.expected_limits(
        mode="histo",
        theta_true=theta_true,
        grid_ranges=theta_ranges,
        grid_resolutions=resolutions,
        include_xsec=flag,
        luminosity=luminosity,
        hist_vars=[histogram_var],
def train(logger):
    """
    Perform the training routine for a given fold. Saves plots and selected
    parameters to the experiment dir specified in the configs.
    """
    logger.info('performing training in {}D over fold {} on experiment {} with model {}'
                .format(cf.dim, cf.fold, cf.exp_dir, cf.model))

    net = model.net(cf, logger).cuda()
    if hasattr(cf, "optimizer") and cf.optimizer.lower() == "adam":
        logger.info("Using Adam optimizer.")
        optimizer = torch.optim.Adam(
            utils.parse_params_for_optim(net,
                                         weight_decay=cf.weight_decay,
                                         exclude_from_wd=cf.exclude_from_wd),
            lr=cf.learning_rate[0])
    else:
        logger.info("Using AdamW optimizer.")
        optimizer = torch.optim.AdamW(
            utils.parse_params_for_optim(net,
                                         weight_decay=cf.weight_decay,
                                         exclude_from_wd=cf.exclude_from_wd),
            lr=cf.learning_rate[0])

    if cf.dynamic_lr_scheduling:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode=cf.scheduling_mode, factor=cf.lr_decay_factor,
            patience=cf.scheduling_patience)

    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)

    starting_epoch = 1

    # prepare monitoring
    monitor_metrics = utils.prepare_monitoring(cf)

    if cf.resume:
        checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
        starting_epoch, net, optimizer, monitor_metrics = \
            utils.load_checkpoint(checkpoint_path, net, optimizer)
        logger.info('resumed from checkpoint {} to epoch {}'.format(
            checkpoint_path, starting_epoch))

    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)

    # Prepare MLFlow
    best_loss = 1e3
    step = 1
    mlflow.log_artifacts(cf.exp_dir, "exp")

    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
        start_time = time.time()

        net.train()
        train_results_list = []
        bix = 0
        seen_pids = []
        while True:
            bix = bix + 1
            try:
                batch = next(batch_gen['train'])
            except StopIteration:
                break
            for pid in batch['pid']:
                seen_pids.append(pid)
            # print(f'\rtr. batch {bix}: {batch["pid"]}')
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()
            optimizer.step()
            print('\rtr. batch {0} (ep. {1}) fw {2:.2f}s / bw {3:.2f} s / total {4:.2f} s || '
                  .format(bix + 1, epoch, tic_bw - tic_fw,
                          time.time() - tic_bw, time.time() - tic_fw)
                  + results_dict['logger_string'],
                  flush=True, end="")
            train_results_list.append(
                ({k: v for k, v in results_dict.items() if k != "seg_preds"},
                 batch["pid"]))
        print(f"Seen pids (unique): {len(np.unique(seen_pids))}")
        print()

        _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(
            train_results_list, monitor_metrics['train'])

        logger.info('generating training example plot.')
        utils.split_off_process(
            plot_batch_prediction, batch, results_dict, cf,
            outfile=os.path.join(cf.plot_dir,
                                 'pred_example_{}_train.png'.format(cf.fold)))

        train_time = time.time() - start_time

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                while True:
                    try:
                        batch = next(batch_gen[cf.val_mode])
                    except StopIteration:
                        break
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch,
                                                         is_validation=True)
                    val_results_list.append(
                        ({k: v for k, v in results_dict.items()
                          if k != "seg_preds"},
                         batch["pid"]))

                _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(
                    val_results_list, monitor_metrics['val'])

                best_model_path = model_selector.run_model_selection(
                    net, optimizer, monitor_metrics, epoch)
                # Save best model
                mlflow.log_artifacts(
                    best_model_path,
                    os.path.join("exp", os.path.basename(cf.fold_dir),
                                 'best_checkpoint'))
                # Save logs and plots
                mlflow.log_artifacts(os.path.join(cf.exp_dir, "logs"),
                                     os.path.join("exp", 'logs'))
                mlflow.log_artifacts(
                    cf.plot_dir,
                    os.path.join("exp", os.path.basename(cf.plot_dir)))

            # update monitoring and prediction plots
            monitor_metrics.update({
                "lr": {str(g): group['lr']
                       for (g, group) in enumerate(optimizer.param_groups)}
            })

            # replace tboard metrics with MLFlow
            # logger.metrics2tboard(monitor_metrics, global_step=epoch)
            mlflow.log_metric('learning rate',
                              optimizer.param_groups[0]['lr'],
                              cf.num_epochs * cf.fold + epoch)
            for key in ['train', 'val']:
                for tag, val in monitor_metrics[key].items():
                    # maybe remove list wrapping, recording in evaluator?
                    val = val[-1]
                    if 'loss' in tag.lower() and not np.isnan(val):
                        mlflow.log_metric(f'{key}_{tag}', val,
                                          cf.num_epochs * cf.fold + epoch)
                    elif not np.isnan(val):
                        mlflow.log_metric(f'{key}_{tag}', val,
                                          cf.num_epochs * cf.fold + epoch)

            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {} ({} train / {} val)'.format(
                epoch, utils.get_formatted_duration(epoch_time, "ms"),
                utils.get_formatted_duration(train_time, "ms"),
                utils.get_formatted_duration(epoch_time - train_time, "ms")))

            batch = next(batch_gen['val_sampling'])
            results_dict = net.train_forward(batch, is_validation=True)
            logger.info('generating validation-sampling example plot.')
            utils.split_off_process(
                plot_batch_prediction, batch, results_dict, cf,
                outfile=os.path.join(cf.plot_dir,
                                     'pred_example_{}_val.png'.format(cf.fold)))

        # -------------- scheduling -----------------
        if cf.dynamic_lr_scheduling:
            scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
        else:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cf.learning_rate[epoch - 1]

    # Save whole experiment to MLFlow
    mlflow.log_artifacts(cf.exp_dir, "exp")
import os
import random
import shutil
import sys
import tempfile

import mlflow
from mlflow import log_metric, log_param, log_artifacts, get_artifact_uri, \
    active_run, get_tracking_uri, log_artifact

if __name__ == "__main__":
    print("Running {} with tracking URI {}".format(sys.argv[0],
                                                   get_tracking_uri()))
    log_param("param1", 5)
    log_metric("foo", 5)
    log_metric("foo", 6)
    log_metric("foo", 7)
    log_metric("random_int", random.randint(0, 100))
    run_id = active_run().info.run_id

    # Get run metadata & data from the tracking server
    service = mlflow.tracking.MlflowClient()
    run = service.get_run(run_id)
    print("Metadata & data for run with UUID %s: %s" % (run_id, run))

    local_dir = tempfile.mkdtemp()
    message = "test artifact written during run %s within artifact URI %s\n" \
              % (active_run().info.run_id, get_artifact_uri())
    try:
        file_path = os.path.join(local_dir, "some_output_file.txt")
        with open(file_path, "w") as handle:
            handle.write(message)
        log_artifacts(local_dir, "some_subdir")
        log_artifact(file_path, "another_dir")
    finally:
        shutil.rmtree(local_dir)
for i in range(num_samples):
    imgs = [scopyon.Image(data)
            for data in numpy.load(inputpath / f"images{i:03d}.npy")]
    spots = [
        scopyon.analysis.spot_detection(
            img.as_array(), processes=nproc,
            min_sigma=min_sigma, max_sigma=max_sigma,
            threshold=threshold, overlap=overlap)
        for img in imgs]

    spots_ = []
    for t, data in zip(timepoints, spots):
        spots_.extend(([t] + list(row) for row in data))
    spots_ = numpy.array(spots_)
    numpy.save(artifacts / f"spots{i:03d}.npy", spots_)

    print("{} spots are detected in {} frames.".format(len(spots_), len(imgs)))

warnings.resetwarnings()

# !ls ./artifacts

# log_artifacts("./artifacts")
log_artifacts(generated_data)
mlflow.end_run()
train_time = timeit.default_timer()
os.system('python {} --mode {} --train-csv {} --model-dir {}'.format(
    train_file,
    'regression' if dataset[-1] == 'r' else 'classification',
    '{}/{}/train.csv'.format(data_dir, dataset),
    '{}/{}/'.format(result_dir, dataset)))
train_time = timeit.default_timer() - train_time

pred_time = timeit.default_timer()
os.system('python {} --prediction-csv {} --test-csv {} --model-dir {}'.format(
    predict_file,
    '{}/{}/pred.csv'.format(result_dir, dataset),
    '{}/{}/test.csv'.format(data_dir, dataset),
    '{}/{}/'.format(result_dir, dataset)))
pred_time = timeit.default_timer() - pred_time

df = pd.read_csv('{}/{}/test-target.csv'.format(data_dir, dataset))
df_pred = pd.read_csv('{}/{}/pred.csv'.format(result_dir, dataset))
df = pd.merge(df, df_pred, on='line_id', left_index=True)

score = roc_auc_score(df.target.values, df.prediction.values) \
    if dataset[-1] == 'c' else \
    np.sqrt(mean_squared_error(df.target.values, df.prediction.values))
print('Score {:0.5f}'.format(score))

n = dataset.split('_')[1]
mlflow.log_metric('score_{}'.format(n), score)
mlflow.log_metric('train_time_{}'.format(n), train_time)
mlflow.log_metric('test_time_{}'.format(n), pred_time)
mlflow.log_artifacts('./')
def rbf_svr_tuning(c=[0.001, 0.01, 0.1, 1, 10],
                   gamma=[0.001, 0.01, 0.1, 1, 10],
                   k=5,
                   train_data_path='../data/training_data.csv',
                   save_model=False,
                   tracking_uri="http://0.0.0.0:5000"):

    # Log the parameters with mlflow
    mlflow.log_param("c", c)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                               ('regression', SVR(kernel='rbf'))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparameter tuning
    hyperparams = {}
    hyperparams['regression__C'] = c
    hyperparams['regression__gamma'] = gamma

    print("Training started...\n")

    # Create a grid search over the SVR pipeline and fit the data for the
    # grid parameters using all processors
    modelCV = GridSearchCV(estimator=pipeline,
                           param_grid=hyperparams,
                           cv=k,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter
    # combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # NEGATIVE MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(f"\nBest parameter set found for the training set:\n{modelCV.best_params_}")

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_c = modelCV.best_params_['regression__C']
    best_gamma = modelCV.best_params_['regression__gamma']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Criteria is C
    criteria = 'c'
    mlflow.set_tag("criteria", criteria)
    param_values = c

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(steps=[('scaler', StandardScaler()),
                                ('regression', SVR(C=param_value,
                                                   gamma=best_gamma,
                                                   kernel='rbf'))])
        param = {'regression__C': param_value, 'regression__gamma': best_gamma}

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(
            training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print("\nEvaluation finished. Training final model with train + test "
              "data with the best hyperparameters...")
        final_model = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regression', SVR(C=param_list[best_index]['regression__C'],
                               gamma=best_gamma,
                               kernel='rbf'))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
plt.title('XGBoost Classification ' + metric_name)
plt.savefig("artifacts/" + metric_name + ".png")
plt.show()

##################################
# Plot decision tree, feature importance stack and precision-recall curve
# ................................
precision, recall, _ = precision_recall_curve(test.get_label(), predictions)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.savefig("artifacts/PR_curve.png")
plt.show()

plt.rcParams["figure.figsize"] = (14, 7)
xgb.plot_importance(clf, grid=False)
plt.savefig("artifacts/importance.png")
plt.show()

plt.rcParams["figure.figsize"] = (14, 3)
xgb.plot_tree(clf)
plt.title('Decision Tree')
plt.savefig("artifacts/tree.png")
plt.show()

# Persist the model and save artifacts to mlflow
clf.save_model('artifacts/xgb.model')
mlflow.log_artifacts("artifacts")
                callback=callback,
                log_interval=args.log_interval)
    model.save(env.simulator._env_working_dir_parent + '/' + name)

    # If the algorithm doesn't reset or close the environment, this script
    # will do it in order to correctly log all the simulation data
    # (EnergyPlus + Sinergym logs)
    if env.simulator._episode_existed:
        env.close()

    # ---------------------------------------------------------------------- #
    #                        Mlflow artifacts storage                         #
    # ---------------------------------------------------------------------- #
    if args.mlflow_store:
        # Send output and tensorboard to mlflow artifacts.
        mlflow.log_artifacts(local_dir=env.simulator._env_working_dir_parent,
                             artifact_path=name)
        if args.evaluation:
            mlflow.log_artifacts(local_dir='best_model/' + name,
                                 artifact_path='best_model/' + name)
        # If tensorboard is active (locally), we should send it to mlflow too
        if args.tensorboard and 'gs://' + args.bucket_name not in args.tensorboard:
            mlflow.log_artifacts(
                local_dir=args.tensorboard + '/' + name,
                artifact_path=os.path.abspath(args.tensorboard).split('/')[-1]
                + '/' + name)

    # ---------------------------------------------------------------------- #
    #                      Google Cloud Bucket Storage                        #
    # ---------------------------------------------------------------------- #
    if args.remote_store:
        # Initiate Google Cloud client
import os
from random import random, randint
from mlflow import log_metric, log_param, log_artifacts

if __name__ == "__main__":
    # Log a parameter (key-value pair)
    log_param("param1", randint(0, 100))

    # Log a metric; metrics can be updated throughout the run
    log_metric("foo", random())
    log_metric("foo", random() + 1)
    log_metric("foo", random() + 2)

    # Log an artifact (output file)
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    with open("outputs/test.txt", "w") as f:
        f.write("hello world!")
    log_artifacts("outputs")
        if not is_nni_run_standalone():
            # Report final training results to NNI (NNI HP or NNI Classic NAS APIs)
            # TODO: make sure `valid_state.metrics` is ordered so that the
            # reported default metric to NNI is always the same
            nni.report_final_result({
                'default': list(valid_evaluator.state.metrics.values())[0],
                **train_evaluator.state.metrics,
                **valid_evaluator.state.metrics})
        return (valid_evaluator.state.metrics, state)
    except Exception as e:
        logging.error(
            f'Ignite training loop of "{type(model).__name__}" model failed, '
            f'exception "{e}" raised{deepcv.utils.NL}### Traceback ###'
            f'{deepcv.utils.NL}{traceback.format_exc()}')
        raise RuntimeError(
            f'Error: `{e}` exception raised during ignite training loop of '
            f'"{type(model).__name__}" model...') from e
    finally:
        if backend_conf.rank == 0:
            tb_logger.close()
            if hp['log_output_dir_to_mlflow'] and mlflow.active_run():
                logging.info('Logging training output directory as mlflow artifacts...')
                mlflow.log_artifacts(str(output_path))
                # TODO: log and replace artifacts to mlflow at every epoch?
                # TODO: make sure all artifacts are loaded synchronously here
                # shutil.rmtree(output_path)


def _setup_distributed_training(device, backend_conf: BackendConfig,
                                model: torch.nn.Module, batch_shape: torch.Size,
                                use_sync_batch_norm: bool = False) -> torch.nn.Module:
    if backend_conf.distributed:
        # Setup distributed training with `torch.distributed`
        dist.init_process_group(backend_conf.dist_backend,
                                init_method=backend_conf.dist_url)
        assert backend_conf.is_cuda, 'Error: Distributed training must be run on GPU(s).'
        # TODO: make sure we dont want to add more device IDs here
        # (see distributed examples in Ignite or PyTorch)
        torch.cuda.set_device(backend_conf.device)
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[backend_conf.local_rank],
            output_device=backend_conf.local_rank)
        if use_sync_batch_norm and any(
                isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                               torch.nn.BatchNorm3d))
                for m in model.modules()):