X_train = train_df[args.features.split()]
X_test = test_df[args.features.split()]
y_train = train_df[args.target]
y_test = test_df[args.target]

# set remote mlflow server
mlflow.set_tracking_uri(args.tracking_uri)
mlflow.set_experiment(args.experiment_name)

with mlflow.start_run():
    params = {
        "n-estimators": args.n_estimators,
        "min-samples-leaf": args.min_samples_leaf,
        "features": args.features
    }
    mlflow.log_params(params)

    # TRAIN
    logging.info('training model')
    model = RandomForestRegressor(n_estimators=args.n_estimators,
                                  min_samples_leaf=args.min_samples_leaf,
                                  n_jobs=-1)
    model.fit(X_train, y_train)

    # ABS ERROR AND LOG COUPLE PERF METRICS
    logging.info('evaluating model')
    abs_err = np.abs(model.predict(X_test) - y_test)
    for q in [10, 50, 90]:
        # The source cuts off inside this call; assumed completion:
        # log the q-th percentile of the absolute error.
        logging.info('abs error percentile %s: %s', q, np.percentile(abs_err, q))
        mlflow.log_metric(f'abs_error_p{q}', np.percentile(abs_err, q))
                         shuffle=False,
                         collate_fn=pad_sequences,
                         drop_last=False)
else:
    test_dataset = None
    test_loader = None

mlflow.set_experiment(f"diplodatos.{args.language}")

with mlflow.start_run():
    logging.info("Starting experiment")
    # Log all relevant hyperparameters
    mlflow.log_params({
        "model_type": "CNN",
        "embeddings": args.pretrained_embeddings,
        "dropout": args.dropout,
        "embeddings_size": args.embeddings_size,
        "epochs": args.epochs
    })
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    logging.info("Building classifier")
    model = CNNClassifier(
        pretrained_embeddings_path=args.pretrained_embeddings,
        token_to_index=args.token_to_index,
        n_labels=train_dataset.n_labels,
        dropout=args.dropout,
        vector_size=args.embeddings_size,
        freeze_embedings=True,
        FILTERS_LENGTH=[2, 3, 4, 5],
    )
                         shuffle=False,
                         collate_fn=pad_sequences,
                         drop_last=False)
else:
    test_dataset = None
    test_loader = None

mlflow.set_experiment(f"diplodatos.{args.language}")

with mlflow.start_run():
    logging.info("Starting experiment")
    # Log all relevant hyperparameters
    mlflow.log_params({
        "model_type": "Multilayer Perceptron",
        "embeddings": args.pretrained_embeddings,
        "hidden_layers": args.hidden_layers,
        "dropout": args.dropout,
        "embeddings_size": args.embeddings_size,
        "epochs": args.epochs
    })
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    logging.info("Building classifier")
    model = MLPClassifier(
        pretrained_embeddings_path=args.pretrained_embeddings,
        token_to_index=args.token_to_index,
        n_labels=train_dataset.n_labels,
        hidden_layers=args.hidden_layers,
        dropout=args.dropout,
        vector_size=args.embeddings_size,
        freeze_embedings=True  # This can be a hyperparameter
    )
def log_params(cls, params):
    mlflow.log_params(params)
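# Usage sketch for a thin wrapper like log_params above. The `Tracker` class
# here is a hypothetical host for the classmethod (the enclosing class is not
# shown in the source); mlflow.log_params itself just needs an active run and
# a flat dict of key/value pairs.
import mlflow

class Tracker:
    @classmethod
    def log_params(cls, params):
        mlflow.log_params(params)

with mlflow.start_run():
    Tracker.log_params({"lr": 1e-3, "batch_size": 32})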
def train(args, yml_config):
    with strategy.scope():
        # Load tensorflow datasets: we use the tensorflow flowers dataset as an example
        batch_size = yml_config['finetuning']['batch']
        buffer_size = yml_config['finetuning']['buffer_size']
        dataset_name = yml_config['data_src']
        if dataset_name == 'tf_flowers':
            tfds_dataset, tfds_info = tfds.load(dataset_name, split='train', with_info=True)
            num_images = tfds_info.splits['train'].num_examples
            num_classes = tfds_info.features['label'].num_classes
            x = tfds_dataset.map(_preprocess).batch(batch_size)
            x = tf1.data.make_one_shot_iterator(x).get_next()
        elif dataset_name == 'chest_xray':
            if args.xray_path == '':
                data_path = yml_config['dataset']['chest_xray']
            else:
                data_path = args.xray_path
            train_dataset, tfds_info = chest_xray.XRayDataSet(data_path, config=None, train=True)
            num_images = np.floor(yml_config['finetuning']['train_data_ratio'] * tfds_info['num_examples'])
            num_classes = tfds_info['num_classes']
            print(f"Training: {num_images} images...")

            def _preprocess(x):
                x['image'] = preprocess_image(x['image'], 224, 224,
                                              is_training=False, color_distort=False)
                return x

            x_ds = train_dataset \
                .take(num_images) \
                .map(_preprocess, deterministic=False) \
                .shuffle(buffer_size) \
                .batch(yml_config['finetuning']['batch'])
            x_iter = tf1.data.make_one_shot_iterator(x_ds)
            x_init = x_iter.make_initializer(x_ds)
            x = x_iter.get_next()
            print(f"{type(x)} {type(x['image'])} {x['image']} {x['label']}")

        # Load module and construct the computation graph
        learning_rate = yml_config['finetuning']['learning_rate']
        momentum = yml_config['finetuning']['momentum']
        weight_decay = yml_config['finetuning']['weight_decay']
        epoch_save_step = yml_config['finetuning']['epoch_save_step']
        load_saver = yml_config['finetuning'].get('load_ckpt')

        # Load the base network and set it to non-trainable (to speed up fine-tuning)
        hub_path = str(Path(yml_config['finetuning']['pretrained_build']).resolve())
        hub_path = os.path.join(hub_path, 'hub')
        module = hub.Module(hub_path, trainable=yml_config['finetuning']['train_resnet'])
        if yml_config['finetuning']['pretrained_model'] == 'ChestXRay':
            key = module(inputs=x['image'], signature="projection-head-1", as_dict=True)
        else:
            key = module(inputs=x['image'], as_dict=True)

        # Attach a trainable linear layer to adapt to the new task.
        if dataset_name == 'tf_flowers':
            with tf1.variable_scope('head_supervised_new', reuse=tf1.AUTO_REUSE):
                logits_t = tf1.layers.dense(inputs=key['default'], units=num_classes, name='proj_head')
            loss_t = tf1.reduce_mean(
                input_tensor=tf1.nn.softmax_cross_entropy_with_logits(
                    labels=tf1.one_hot(x['label'], num_classes), logits=logits_t))
        elif dataset_name == 'chest_xray':
            with tf1.variable_scope('head_supervised_new', reuse=tf1.AUTO_REUSE):
                logits_t = tf1.layers.dense(inputs=key['default'], units=num_classes)
            cross_entropy = weighted_cel(labels=x['label'], logits=logits_t, bound=3.0)
            loss_t = tf1.reduce_mean(tf1.reduce_sum(cross_entropy, axis=1))

        # Setup optimizer and training op.
        if yml_config['finetuning']['optimizer'] == 'adam':
            optimizer = tf1.train.AdamOptimizer(learning_rate)
        elif yml_config['finetuning']['optimizer'] == 'lars':
            optimizer = LARSOptimizer(
                learning_rate,
                momentum=momentum,
                weight_decay=weight_decay,
                exclude_from_weight_decay=['batch_normalization', 'bias', 'head_supervised'])
        else:
            raise RuntimeError("Optimizer not supported")

        variables_to_train = tf1.trainable_variables()
        train_op = optimizer.minimize(loss_t,
                                      global_step=tf1.train.get_or_create_global_step(),
                                      var_list=variables_to_train)
        print('Variables to train:', variables_to_train)

        # Add ops to save and restore all the variables.
        sess = tf1.Session()
        saver = tf1.train.Saver()  # Default saves all variables
        current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        directory = Path(args.output_dir)
        is_time_to_save_session = partial(model_ckpt.save_session, epoch_save_step, saver,
                                          output=directory)
        if load_saver is not None:
            saver.restore(sess, load_saver)
        else:
            sess.run(tf1.global_variables_initializer())

        # We fine-tune the new *linear layer* for just a few iterations.
        epochs = yml_config['finetuning']['epochs']

        # =============== TensorBoard section ===============
        tf_tot_acc_all_ph = tf1.placeholder(tf.float32, shape=None, name='accuracy_all_labels_ph')
        tf_tot_acc_all_summary = tf1.summary.scalar('accuracy_all_labels', tf_tot_acc_all_ph)
        tf_tot_acc_per_class_ph = tf1.placeholder(tf.float32, shape=None, name='accuracy_per_class_ph')
        tf_tot_acc_per_class_summary = tf1.summary.scalar('accuracy_per_class', tf_tot_acc_per_class_ph)
        tf_tot_acc_class_avg_ph = tf1.placeholder(tf.float32, shape=None,
                                                  name='accuracy_per_class_averaged_ph')
        tf_tot_acc_class_avg_summary = tf1.summary.scalar('accuracy_per_class_averaged',
                                                          tf_tot_acc_class_avg_ph)
        tf_train_tot_loss_ph = tf1.placeholder(tf.float32, shape=None, name='train_tot_loss')
        tf_train_tot_loss_summary = tf1.summary.scalar('train_tot_loss', tf_train_tot_loss_ph)
        tf_tot_auc_ph = tf1.placeholder(tf.float32, shape=None, name='auc_ph')
        tf_tot_auc_ph_summary = tf1.summary.scalar('auc', tf_tot_auc_ph)
        performance_summaries = tf1.summary.merge([
            tf_tot_acc_all_summary, tf_tot_acc_class_avg_summary,
            tf_train_tot_loss_summary, tf_tot_auc_ph_summary
        ])

        hyper_param = []
        print(f"yml_config[pretrained_build] = {yml_config['finetuning']['pretrained_build']}")
        for item in yml_config['finetuning']:
            hyper_param.append(
                tf1.summary.text(str(item),
                                 tf.constant(str(yml_config['finetuning'][item])),
                                 'HyperParam'))
        summ_writer = tf1.summary.FileWriter(directory / 'tb', sess.graph)
        tf.summary.record_if(yml_config['tensorboard'])

        # Limit the precision of printed floats.
        np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

        with sess.as_default() as scope:
            if yml_config['mlflow']:
                # Log params in MLflow
                if args.mlflow_dir is None:
                    mlflow.set_tracking_uri(yml_config['mlflow_path'])
                else:
                    mlflow.set_tracking_uri(args.mlflow_dir)
                mlflow.set_experiment('results')
                mlflow.start_run()

                # Open the pickle file that contains the hyperparameters of the pretrained build
                fname = os.path.join(yml_config['finetuning']['pretrained_build'],
                                     'experiment_flags.p')
                if os.path.exists(fname):
                    with open(fname, 'rb') as f:
                        pretuned_params = pickle.load(f)
                    pretuned_params = {
                        'P-' + str(key).replace('/', '').replace('?', '').replace('$', ''): val
                        for key, val in pretuned_params.items()
                    }
                    mlflow.log_params(pretuned_params)

                # Open the pickle file that contains the metrics of the pretrained build
                fname = os.path.join(yml_config['finetuning']['pretrained_build'], 'mAP_result.p')
                if os.path.exists(fname):
                    with open(fname, 'rb') as f:
                        pretuned_metric = pickle.load(f)
                    mlflow.log_metrics(pretuned_metric)

                finetuned_params = {
                    'F-' + str(key).replace('/', ''): val
                    for key, val in yml_config['finetuning'].items()
                }
                mlflow.log_param('TB_Timestamp', current_time)
                mlflow.log_params(finetuned_params)

            fname = os.path.join(directory, 'finetuning_hyper_params.txt')
            with open(fname, 'w') as f:
                for key, value in yml_config['finetuning'].items():
                    f.write('%s:%s\n' % (key, value))

            writer = tf1.summary.FileWriter('./log', sess.graph)
            for index, summary_op in enumerate(hyper_param):
                text = sess.run(summary_op)
                summ_writer.add_summary(text, index)

            n_iter = int(num_images / batch_size)
            print(f"Batch: {batch_size}, n_iter: {n_iter}")

            # =============== Main Loop (epoch) - START ===============
            for it in range(epochs):
                start_time_epoch = time.time()
                # Init dataset iterator
                sess.run(x_init)
                # Accuracy all: every class must be correct
                # Accuracy per class: score for each class
                # Accuracy class average: the average of the per-class accuracies
                tot_acc_all = 0.0
                tot_acc_per_class = 0.0
                tot_acc_class_avg = 0.0
                train_tot_loss = 0.0
                epoch_acc_all = 0.0
                epoch_acc_per_class = 0.0
                epoch_acc_class_avg = 0.0
                # show_one_image(x['image'][0].eval())

                # =============== Main Loop (iteration) - START ===============
                all_labels = []
                all_logits = []
                for step in range(n_iter):
                    start_time_iter = time.time()
                    _, loss, image, logits, labels = sess.run(
                        fetches=(train_op, loss_t, x['image'], logits_t, x['label']))
                    train_tot_loss += loss
                    all_labels.extend(labels)
                    if dataset_name == 'tf_flowers':
                        pred = logits.argmax(-1)
                        correct = np.sum(pred == labels)
                        acc_per_class = np.array([correct / float(batch_size)])
                    elif dataset_name == 'chest_xray':
                        logits_sig = scipy.special.expit(logits)
                        all_logits.extend(logits_sig)
                        pred = (logits_sig > 0.5).astype(np.float32)
                        acc_all = np.mean(np.min(np.equal(pred, labels).astype(np.float32), axis=1))
                        acc_per_class = np.mean(np.equal(pred, labels).astype(np.float32), axis=0)
                        acc_class_avg = np.mean(acc_per_class)
                        tot_acc_all += acc_all
                        tot_acc_per_class += acc_per_class
                        tot_acc_class_avg += acc_class_avg

                    # roc_auc_score raises a ValueError ("Only one class present in y_true.
                    # ROC AUC score is not defined in that case") when a label has only one
                    # class in the batch, e.g. when every sample in the batch has hernia=+1.
                    try:
                        auc_cum = roc_auc_score(np.array(all_labels), np.array(all_logits))
                    except ValueError:
                        auc_cum = None
                    current_time_iter = time.time()
                    elapsed_time_iter = current_time_iter - start_time_iter
                    if yml_config['finetuning']['verbose_train_loop']:
                        print(f"[Epoch {it + 1}/{epochs} Iter: {step}/{n_iter}] "
                              f"Model: {yml_config['finetuning']['pretrained_model']}, "
                              f"Total Loss: {train_tot_loss} Loss: {np.float32(loss)}"
                              f" AUC Cumulative: {auc_cum}")
                        print(f"Finished iteration {step} in: {int(elapsed_time_iter)} sec")

                    # Break if the logits explode
                    if np.isnan(np.sum(logits)):
                        print("Loss has exploded: NaN")
                        break

                epoch_acc_all = tot_acc_all / n_iter
                epoch_acc_per_class = tot_acc_per_class / n_iter
                epoch_acc_class_avg = tot_acc_class_avg / n_iter
                try:
                    epoch_auc = roc_auc_score(np.array(all_labels), np.array(all_logits),
                                              average=None)
                    epoch_auc_mean = epoch_auc.mean()
                    aucs = dict(zip(chest_xray.XR_LABELS.keys(), epoch_auc))
                    auc_scores = {'AUC ' + str(key): val for key, val in aucs.items()}
                except ValueError:
                    epoch_auc = None
                    epoch_auc_mean = None
                print(f"[Epoch {it + 1}/{epochs} "
                      f"Model: {yml_config['finetuning']['pretrained_model']}, "
                      f"Loss: {train_tot_loss}"
                      f" Train AUC: {epoch_auc_mean} AUC/Class: {epoch_auc}")

                # Is it time to save the session?
                is_time_to_save_session(it, sess)
                current_time_epoch = time.time()
                elapsed_time_iter = current_time_epoch - start_time_epoch
                print(f"Finished EPOCH {it + 1} in: {int(elapsed_time_iter)} sec")

                # ===================== Write TensorBoard summary =====================
                # Execute the summaries defined above
                summ = sess.run(performance_summaries,
                                feed_dict={
                                    tf_tot_acc_all_ph: epoch_acc_all,
                                    tf_tot_acc_class_avg_ph: epoch_acc_class_avg,
                                    tf_train_tot_loss_ph: train_tot_loss,
                                    tf_tot_auc_ph: epoch_auc_mean
                                })
                # Write the summaries to file so they can be displayed in TensorBoard
                summ_writer.add_summary(summ, it)
            # =============== Main Loop (epoch) - END ===============

            print("Training done")
            if yml_config['mlflow']:
                mlflow.log_metric('Total Train Accuracy', epoch_acc_all)
                mlflow.log_metric('Total Train Accuracy per class', np.mean(epoch_acc_per_class))
                mlflow.log_metric('Total Train Loss', train_tot_loss)
                if epoch_auc is not None:
                    mlflow.log_metrics(auc_scores)

            fname_final = str(directory / 'final.ckpt')
            ckpt_pt = saver.save(sess=sess, save_path=fname_final)
            print(f"Final checkpoint saved in {fname_final}")
    return directory
        solver=config.get('hyperparams').get('log_reg').get('solver'),
        random_state=config.get('random_state'),
        max_iter=config.get('hyperparams').get('log_reg').get('max_iter'),
        tol=config.get('hyperparams').get('log_reg').get('tol')
    )
)

param_grid = {
    'model__penalty': config.get('hyperparams').get('log_reg').get('penalty'),
    'model__C': config.get('hyperparams').get('log_reg').get('C')
}

gs_clf = GridSearchCV(pipeline,
                      param_grid=param_grid,
                      scoring=config.get('scoring'),
                      cv=config.get('k'),
                      verbose=1)
gs_clf.fit(X_train, y_train)

mlflow.log_params(gs_clf.best_params_)
mlflow.log_metrics({
    '_'.join([config.get('scoring'), 'train']): gs_clf.score(X_train, y_train),
    '_'.join([config.get('scoring'), 'test']): gs_clf.score(X_test, y_test)
})

print('Plotting ROC curve...')
img_fn = 'roc_{}.png'.format(RUN_NAME)
plot_roc_curve(gs_clf, X_test, y_test, img_fn)

# print('Plotting learning curve...')
# img_fn = 'learning_curve_{}.png'.format(RUN_NAME)
# plot_learning_curve(gs_clf, X_train, y_train, img_fn)

print('Recording hyperparameters used...')
txt_fn = 'hyperparams_{}.txt'.format(RUN_NAME)
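# The snippet above saves the ROC plot to `img_fn` but never attaches it to the
# run. A minimal follow-up sketch (an assumption, not part of the source):
# mlflow.log_artifact uploads a local file to the active run's artifact store.
mlflow.log_artifact(img_fn)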
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", default="output", type=str,
                        help="The output directory where the model predictions and "
                             "checkpoints will be written.")
    parser.add_argument("--bert_embeddings", action='store_true',
                        help="Whether to use RoBERTa embeddings.")
    parser.add_argument("--train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--eval", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--dataset_name", type=str)
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=32,
                        help="Number of update steps to accumulate before performing a "
                             "backward/update pass.")
    parser.add_argument("--weight_decay", default=1e-5, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--dataloader_workers", default=16, type=int)
    parser.add_argument("--num_epochs", default=40, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for initialization.")
    parser.add_argument("--batch_size", default=128, type=int,
                        help="Batch size per GPU/CPU.")
    parser.add_argument("--batch_size_eval", default=256, type=int,
                        help="Eval batch size per GPU/CPU.")
    parser.add_argument("--run_name", type=str, help="Name of the mlflow run.")
    parser.add_argument("--overwrite_output_dir", action='store_true',
                        help="Overwrite the content of the output directory.")
    args = parser.parse_args()

    mlflow.set_experiment("article2image")
    mlflow.start_run(run_name=args.run_name)
    mlflow.log_params(vars(args))

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and not args.overwrite_output_dir and args.train):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    best_model_img = os.path.join(args.output_dir, "best_model_img.model")
    best_model_text = os.path.join(args.output_dir, "best_model_text.model")

    set_seed(args.seed)
    n_gpu = torch.cuda.device_count()
    batch_size = args.batch_size * max(1, n_gpu)
    batch_size_eval = args.batch_size_eval * max(1, n_gpu)

    test_dataloader = torch.utils.data.DataLoader(
        dataset=MyDataset('val', args.dataset_name, args.bert_embeddings),
        batch_size=batch_size_eval, shuffle=False, num_workers=4)

    input_size = 768 if args.bert_embeddings else 512
    img_model = torch.nn.DataParallel(ImageProjectModel()).cuda()
    text_model = torch.nn.DataParallel(TextProjectModel(input_size)).cuda()

    if args.train:
        train_dataloader = torch.utils.data.DataLoader(
            dataset=MyDataset('train', args.dataset_name, args.bert_embeddings),
            batch_size=batch_size, shuffle=True, num_workers=args.dataloader_workers)
        optimizer = torch.optim.Adam(
            params=itertools.chain(img_model.parameters(), text_model.parameters()),
            lr=args.learning_rate, weight_decay=args.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', verbose=True, patience=5)
        itr = 0
        triplet_loss = TripletLoss()
        best_loss = sys.maxsize
        for e in tqdm(range(1, args.num_epochs + 1), ascii=True, desc='Epoch'):
            img_model.train()
            text_model.train()
            with tqdm(total=len(train_dataloader), ascii=True, leave=False, desc='iter') as pbar:
                for i, (images, articles_ids, articles_mask) in enumerate(train_dataloader):
                    itr += 1
                    image_projections = img_model(images.float().cuda())
                    article_projections = text_model(articles_ids.cuda(), articles_mask.cuda())
                    loss = triplet_loss(image_projections, article_projections)
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    # Accumulate gradients every step; update weights every
                    # `gradient_accumulation_steps` steps.
                    loss.backward()
                    if (i + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()
                    if itr % 100 == 0:
                        mlflow.log_metric(
                            "training loss",
                            loss.item() / max((len(image_projections) * len(article_projections)
                                               - len(image_projections)), 1),
                            itr)
                    pbar.update()

            img_model.eval()
            text_model.eval()
            losses = []
            with tqdm(total=len(test_dataloader), ascii=True, leave=False,
                      desc='eval') as pbar, torch.no_grad():
                for i, (images, articles_ids, articles_mask) in enumerate(test_dataloader):
                    image_projections = img_model(images.float().cuda())
                    article_projections = text_model(articles_ids.cuda(), articles_mask.cuda())
                    loss = triplet_loss(image_projections, article_projections)
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    losses.append(loss.item() / max(
                        (len(image_projections) * len(article_projections)
                         - len(image_projections)), 1))
                    pbar.update()
            test_loss = np.mean(losses)
            mlflow.log_metric("test loss", test_loss, e)
            scheduler.step(test_loss)

            # Save only the best model
            if test_loss < best_loss:
                best_loss = test_loss
                if os.path.exists(best_model_img):
                    os.remove(best_model_img)
                if os.path.exists(best_model_text):
                    os.remove(best_model_text)
                torch.save(img_model.state_dict(), best_model_img)
                torch.save(text_model.state_dict(), best_model_text)

    if args.eval:
        img_model.load_state_dict(torch.load(best_model_img))
        text_model.load_state_dict(torch.load(best_model_text))
        embeddings_img, embeddings_cap = compute_embeddings(
            img_model, text_model, test_dataloader, batch_size_eval)
        recall = t2i(embeddings_img, embeddings_cap)
        avg_recall = (recall[0] + recall[1] + recall[2]) / 3
        print("Average t2i Recall: %.1f" % avg_recall)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % recall)
        mlflow.log_metric("R1", recall[0])
        mlflow.log_metric("R5", recall[1])
        mlflow.log_metric("R10", recall[2])
        mlflow.log_metric("MdR", recall[3])
        mlflow.log_metric("MeR", recall[4])
def log_parameters(ax_experiment):
    arm_name = ax_experiment.fetch_data().df.iloc[-1, :]['arm_name']
    arm = ax_experiment.arms_by_name[arm_name]
    mlflow.log_params(arm.parameters)
def log_params(cls, params):
    try:
        mlflow.log_params(params)
    except ConnectionError:
        logger.warning("ConnectionError in logging params to MLFlow")
def run(self, args: argparse.Namespace) -> None:
    logger.info("Load config from %s", args.config)
    config = load_yaml(minato.cached_path(args.config), args.overrides)
    logger.info("Configuration: %s", str(config))

    builder = ConfigBuilder.build(config)
    model = builder.model
    train_file = args.train or builder.train_file
    validation_file = args.validation or builder.validation_file

    if not train_file:
        raise ConfigurationError("train file is required.")

    logger.info("Start training...")
    logger.info("Training data: %s", str(train_file))
    logger.info("Validation data: %s", str(validation_file))

    params = {
        "command": " ".join(sys.argv),
        "config_file": args.config,
        "train_file": train_file,
        "validation_file": validation_file,
        "serialization_dir": args.serialization_dir,
        "config": config,
    }

    with _mlflow_start_run():
        serialization_dir = args.serialization_dir
        if args.serialization_dir is None and mlflow is None:
            serialization_dir = "./output"

        with create_workdir(serialization_dir, exist_ok=args.force) as workdir:
            workdir = workdir.absolute()
            try:
                with open(workdir / "config.yaml", "w") as f:
                    yaml.dump(config, f)
                with open(workdir / "params.json", "w") as f:
                    json.dump(params, f, indent=2)

                if mlflow is not None:
                    logger.info("Log params to mlflow")
                    mlflow.log_params(params)

                metrics = model.train(train_file, validation_file, workdir)

                if mlflow is not None:
                    logger.info("Log metrics to mlflow")
                    mlflow.log_metrics(metrics)

                logger.info("Training completed")
                logger.info("Training metrics: %s", json.dumps(metrics, indent=2))

                with open(workdir / "metrics.json", "w") as metrics_file:
                    json.dump(metrics, metrics_file)
                with open(workdir / "model.pkl", "wb") as model_file:
                    pickle.dump(model, model_file)
            finally:
                if mlflow is not None:
                    logger.info("Log artifacts to mlflow")
                    mlflow.log_artifacts(str(workdir))

    logger.info("Done!")
def train(
    model: str,
    experiment_name: str = None,
    data_dir=None,
    root_dir=None,
    best_metric="val_accuracy",
    **kwargs,
):
    """Base method to train a model.

    Trains the model given by `model` based on the `MODEL_DICT` correspondence,
    and defines `experiment_name` in the MlFlow tracking server.

    Args:
        model (str): the model to train. Only two choices: `model1` or `model2`.
        experiment_name (str, optional): the experiment name to define in the MlFlow
            tracking server. Defaults to None; if None, it is set to the `model` value.
        best_metric (str, optional): the metric used to evaluate the model and to check
            whether performance has improved over the previous best model.
            Defaults to "val_accuracy".
    """
    _check_input(model)
    if experiment_name is None:
        experiment_name = model

    owd = os.getcwd()
    root_dir = Paths(root_dir=root_dir).root_dir
    os.chdir(root_dir)

    mlflow.set_experiment(experiment_name)
    tracker = MlFlowTracker(root_dir=root_dir)
    print(tracker.root_dir)

    timestamp = time.strftime("%Y%m%d%H%M")
    run_name = f"{experiment_name}_{timestamp}"

    learner = MODEL_DICT.get(model)(data_dir=data_dir)
    print(learner.name)
    version = tracker.get_new_version(experiment_name)
    logging.info(version)

    with mlflow.start_run(run_name=run_name):
        run_uuid = mlflow.active_run().info.run_uuid
        logging.info(f"MLflow Run ID: {run_uuid}")

        learner.train(**kwargs)

        # Get training params
        params = learner.get_params()
        # Log parameters
        mlflow.log_params(params)

        # Calculate metrics
        metrics = {}
        for metric in learner.metrics:
            metrics[metric] = learner.history[metric][-1]
            metrics[f"val_{metric}"] = learner.history[f"val_{metric}"][-1]
        metrics["loss"] = learner.history["loss"][-1]
        metrics["val_loss"] = learner.history["val_loss"][-1]
        final_metric = metrics.get(best_metric)

        # Log metrics
        mlflow.log_metrics(metrics)

        # Log model
        model_name = learner.model.name
        X_train = learner.X_train
        y_pred = learner.predict(X_train)
        signature = infer_signature(X_train, y_pred)
        mlflow.keras.log_model(learner.model.model, model_name,
                               signature=signature, save_format="tf")

        models_path = Paths(root_dir=root_dir).model / "models"
        if not models_path.exists():
            models_path.mkdir()

        final_metric_best = tracker.get_best_model_metric(experiment_name, metric=best_metric)
        if final_metric >= final_metric_best:
            logging.info("Best model found. Saving to model dir to use with Tensorflow Serving")
            model_path = os.path.join(str(models_path), model)
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            if model == "model2":
                tfmodel = TFModel(learner.model.model)
                tf.saved_model.save(tfmodel.model, os.path.join(model_path, "0"),
                                    signatures={"serving_default": tfmodel.prediction})
                print(tfmodel)
            else:
                learner.model.model.save(os.path.join(model_path, "0"))
            logging.info(f"Model exported at {model_path}.")
        else:
            logging.info(
                f"Model logged but best performance not improved for experiment "
                f"{experiment_name} (current version: {version})."
            )
    os.chdir(owd)
def run_training_with_mlflow(mlflow_conf, wrapped_model,
                             train_dataloader, val_dataloader=None, test_dataloader=None,
                             **kwargs):
    """
    Function to run supervised training for classification

    Parameters
    ----------
    mlflow_conf: dict
        mlflow configuration, e.g. MLFLOW_URI
    wrapped_model: SKModel
        wrapped SKModel
    train_dataloader:
        training dataloader
    val_dataloader:
        validation dataloader, optional
    test_dataloader:
        test dataloader, optional
    kwargs: dict of dicts, optional
        can contain `artifacts` to log with models, `model_path` to specify the model
        output path; the remaining entries are used as experiment tags

    Returns
    -------
    tuple:
        (run_id, run_metrics, val_y, val_yhat, val_pred_proba,
         test_y, test_yhat, test_pred_proba)
    """
    tune = kwargs.get('tune', False)
    if tune:
        inner_cv = kwargs.get('inner_cv', C.DEFAULT_CV)
        h_search = kwargs.pop('h_search', None)
        if h_search is None:
            raise AttributeError('if tuner is requested, h_search should be provided')
        scoring = kwargs.get('scoring', C.DEFAULT_SCORING_CLASSIFIER)

    model_path = kwargs.pop('model_path', 'model')
    artifacts = kwargs.pop('artifacts', dict())

    mlflow_conf.setdefault('problem_type', 'classifier')
    mlflow_setup = setup_mlflow(**mlflow_conf)
    calculate_metrics = Metrics(mlflow_conf['problem_type'])
    log.debug(f"Mlflow setup: {mlflow_setup}")
    log.debug(f"Used metrics: {calculate_metrics}")

    experiment_name = mlflow_setup['experiment_name']
    experiment_tags = dict()
    experiment_tags.update(**kwargs)

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        _start_time = time.time()

        X_train, y_train = train_dataloader.get_data()
        if val_dataloader is not None:
            X_val, y_val = val_dataloader.get_data()
            outer_cv, _X, _y = get_predefined_split(X_train, y_train, X_val, y_val)
        else:
            warnings.warn("This path is untested...use with caution")
            outer_cv = kwargs.get('outer_cv', None)
            if outer_cv is None:
                warnings.warn(f'Neither validation nor outer_cv provided. '
                              f'Using KFold({C.DEFAULT_CV}) to get validation split')
                outer_cv = KFold(C.DEFAULT_CV)
            _X = X_train.values if hasattr(X_train, 'values') else X_train
            _y = y_train.values if hasattr(y_train, 'values') else y_train

        if test_dataloader is not None:
            X_test, y_test = test_dataloader.get_data()

        if tune:
            m, gs = wrapped_model.tune(X=_X, y=_y, hyper_params=h_search, cv=inner_cv,
                                       experiment_name=experiment_name, scoring=scoring)
            mlflow.sklearn.log_model(m, experiment_name + '_model')
            mlflow.sklearn.log_model(gs, experiment_name + '_GridSearchCV')
            log.info(f"Experiment: {experiment_name} has finished hyperparameter tuning")
            log.info("Hyperparameter search space: " + str(h_search))
            # log params
            mlflow.log_params(wrapped_model.params)
            print(f"Best_params:\n {gs.best_params_}")
        else:
            wrapped_model.fit(X=X_train, y=y_train)
            mlflow.sklearn.log_model(wrapped_model.model, experiment_name + '_model')
            mlflow.log_params(wrapped_model.params)
            log.info(f"Experiment: {experiment_name} has finished training")

        for split_id, (train_index, val_index) in enumerate(outer_cv.split(_X, _y)):
            if split_id >= 1:
                warnings.warn("Current logic for tune and implicit outer_cv not correct")
                break
            _X_train, _X_val = _X[train_index, :], _X[val_index, :]
            _y_train, _y_val = _y[train_index], _y[val_index]

            y_val_proba = wrapped_model.predict_proba(_X_val)
            if y_val_proba.ndim > 1:
                y_val_proba = y_val_proba[:, 1]
            y_val_hat = wrapped_model.predict(_X_val)
            val_score = wrapped_model.score(_X_val, _y_val)

            if test_dataloader is not None:
                y_test_proba = wrapped_model.predict_proba(X_test)
                if y_test_proba.ndim > 1:
                    y_test_proba = y_test_proba[:, 1]
                y_test_hat = wrapped_model.predict(X_test)
                test_score = wrapped_model.score(X_test, y_test)
            else:
                y_test = None
                y_test_hat = None
                y_test_proba = None
                test_score = None

            # Calculate metrics
            wrapped_model.metrics = calculate_metrics(
                y_val=y_val, y_val_proba=y_val_proba, y_val_hat=y_val_hat,
                val_score=val_score,
                y_test=y_test, y_test_proba=y_test_proba, y_test_hat=y_test_hat,
                test_score=test_score)

        _end_time = time.time()
        run_time = _end_time - _start_time

        # log metrics
        mlflow.log_metrics(wrapped_model.metrics)
        experiment_tags.update(dict(run_time=run_time))
        if experiment_tags is not None:
            mlflow.set_tags(experiment_tags)

        # Other artifacts
        _tmp = {f"artifact/{art_name}": art_val
                for art_name, art_val in six.iteritems(artifacts)}
        helper.log_artifacts(_tmp, run_id, mlflow_uri=mlflow_setup['mlflow_uri'], delete=True)

    return (run_id, wrapped_model.metrics,
            y_val, y_val_hat, y_val_proba,
            y_test, y_test_hat, y_test_proba)
def mlflow_callback(study, trial):
    trial_value = trial.value if trial.value is not None else float("nan")
    with mlflow.start_run(run_name=study.study_name):
        mlflow.log_params(trial.params)
        mlflow.log_metrics({"mean_squared_error": trial_value})
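# A minimal sketch of wiring the callback above into an Optuna study. The
# `objective` below is a hypothetical placeholder; `optuna.create_study` and
# `study.optimize(..., callbacks=[...])` are the standard Optuna APIs.
import optuna

def objective(trial):
    x = trial.suggest_float("x", -10, 10)
    return (x - 2) ** 2

study = optuna.create_study(study_name="example_study", direction="minimize")
study.optimize(objective, n_trials=20, callbacks=[mlflow_callback])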
def before_pipeline_run(self, run_params: Dict[str, Any]) -> None:
    """Hook implementation to start an MLflow run with the same run_id
    as the Kedro pipeline run.
    """
    mlflow.start_run(run_name=run_params["run_id"])
    mlflow.log_params(run_params)
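# For a Kedro hook like the one above to fire, the method is normally decorated
# with @hook_impl and the hook class registered in the project's settings.py.
# A minimal sketch; the class name `MLflowHooks` is illustrative, not from the
# source.
from typing import Any, Dict

import mlflow
from kedro.framework.hooks import hook_impl

class MLflowHooks:
    @hook_impl
    def before_pipeline_run(self, run_params: Dict[str, Any]) -> None:
        mlflow.start_run(run_name=run_params["run_id"])
        mlflow.log_params(run_params)

# Then, in src/<package_name>/settings.py:
# HOOKS = (MLflowHooks(),)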
def main(params: dict):
    """
    Identify the class to which each image belongs.

    :param params: (dict) Parameters found in the yaml config file.
    """
    since = time.time()

    # MANDATORY PARAMETERS
    img_dir_or_csv = get_key_def('img_dir_or_csv_file', params['inference'], expected_type=str)
    state_dict = get_key_def('state_dict_path', params['inference'])
    task = get_key_def('task', params['global'], expected_type=str)
    if task not in ['classification', 'segmentation']:
        raise ValueError(f'Task should be either "classification" or "segmentation". Got {task}')
    model_name = get_key_def('model_name', params['global'], expected_type=str).lower()
    num_classes = get_key_def('num_classes', params['global'], expected_type=int)
    num_bands = get_key_def('number_of_bands', params['global'], expected_type=int)
    chunk_size = get_key_def('chunk_size', params['inference'], default=512, expected_type=int)
    BGR_to_RGB = get_key_def('BGR_to_RGB', params['global'], expected_type=bool)

    # OPTIONAL PARAMETERS
    dontcare_val = get_key_def("ignore_index", params["training"], default=-1, expected_type=int)
    num_devices = get_key_def('num_gpus', params['global'], default=0, expected_type=int)
    default_max_used_ram = 25
    max_used_ram = get_key_def('max_used_ram', params['global'],
                               default=default_max_used_ram, expected_type=int)
    max_used_perc = get_key_def('max_used_perc', params['global'], default=25, expected_type=int)
    scale = get_key_def('scale_data', params['global'], default=[0, 1], expected_type=List)
    debug = get_key_def('debug_mode', params['global'], default=False, expected_type=bool)
    raster_to_vec = get_key_def('ras2vec', params['inference'], False)

    # Benchmark parameters (i.e. when gpkgs are inputted along with imagery)
    dontcare = get_key_def("ignore_index", params["training"], -1)
    targ_ids = get_key_def('target_ids', params['sample'], None, expected_type=List)

    # SETTING OUTPUT DIRECTORY
    working_folder = Path(params['inference']['state_dict_path']).parent.joinpath(
        f'inference_{num_bands}bands')
    Path.mkdir(working_folder, parents=True, exist_ok=True)

    # mlflow logging
    mlflow_uri = get_key_def('mlflow_uri', params['global'], default=None, expected_type=str)
    if mlflow_uri and not Path(mlflow_uri).is_dir():
        warnings.warn(f'Mlflow uri path is not valid: {mlflow_uri}')
        mlflow_uri = None

    # SETUP LOGGING
    import logging.config  # See: https://docs.python.org/2.4/lib/logging-config-fileformat.html
    if mlflow_uri:
        log_config_path = Path('utils/logging.conf').absolute()
        logfile = f'{working_folder}/info.log'
        logfile_debug = f'{working_folder}/debug.log'
        console_level_logging = 'INFO' if not debug else 'DEBUG'
        logging.config.fileConfig(log_config_path,
                                  defaults={'logfilename': logfile,
                                            'logfilename_debug': logfile_debug,
                                            'console_level': console_level_logging})

        # Import only if mlflow uri is set
        from mlflow import log_params, set_tracking_uri, set_experiment, start_run, \
            log_artifact, log_metrics
        if not Path(mlflow_uri).is_dir():
            logging.warning(f"Couldn't locate mlflow uri directory {mlflow_uri}. "
                            f"Directory will be created.")
            Path(mlflow_uri).mkdir()
        set_tracking_uri(mlflow_uri)
        exp_name = get_key_def('mlflow_experiment_name', params['global'],
                               default='gdl-inference', expected_type=str)
        set_experiment(f'{exp_name}/{working_folder.name}')
        run_name = get_key_def('mlflow_run_name', params['global'],
                               default='gdl', expected_type=str)
        start_run(run_name=run_name)
        log_params(params['global'])
        log_params(params['inference'])
    else:
        # Set a console logger as default
        logging.basicConfig(level=logging.DEBUG)
        logging.info('No logging folder set for mlflow. Logging will be limited to console.')

    if debug:
        logging.warning('Debug mode activated. Some debug features may mobilize extra disk space '
                        'and cause delays in execution.')

    # Assert that all items in target_ids are integers
    # (e.g. to benchmark a single-class model with multi-class labels)
    if targ_ids:
        for item in targ_ids:
            if not isinstance(item, int):
                raise ValueError(f'Target id "{item}" in target_ids is {type(item)}, expected int.')

    logging.info(f'Inferences will be saved to: {working_folder}\n\n')

    if not (0 <= max_used_ram <= 100):
        logging.warning(f'Max used ram parameter should be a percentage. Got {max_used_ram}. '
                        f'Will set default value of {default_max_used_ram} %')
        max_used_ram = default_max_used_ram

    # AWS
    bucket = None
    bucket_file_cache = []
    bucket_name = get_key_def('bucket_name', params['global'])

    # List of GPU devices that are available and unused. If no GPUs, returns an empty dict.
    gpu_devices_dict = get_device_ids(num_devices,
                                      max_used_ram_perc=max_used_ram,
                                      max_used_perc=max_used_perc)
    if gpu_devices_dict:
        logging.info(f"Number of cuda devices requested: {num_devices}. "
                     f"Cuda devices available: {gpu_devices_dict}. "
                     f"Using {list(gpu_devices_dict.keys())[0]}\n\n")
        device = torch.device(f'cuda:{list(range(len(gpu_devices_dict.keys())))[0]}')
    else:
        logging.warning("No Cuda device available. This process will only run on CPU")
        device = torch.device('cpu')

    # CONFIGURE MODEL
    num_classes_backgr = add_background_to_num_class(task, num_classes)
    model, loaded_checkpoint, model_name = net(model_name=model_name,
                                               num_bands=num_bands,
                                               num_channels=num_classes_backgr,
                                               dontcare_val=dontcare_val,
                                               num_devices=1,
                                               net_params=params,
                                               inference_state_dict=state_dict)
    try:
        model.to(device)
    except RuntimeError:
        logging.info("Unable to use device 0")
        device = torch.device('cuda' if gpu_devices_dict else 'cpu')
        model.to(device)

    # CREATE LIST OF INPUT IMAGES FOR INFERENCE
    list_img = list_input_images(img_dir_or_csv, bucket_name, glob_patterns=["*.tif", "*.TIF"])

    # VALIDATION: anticipate problems with imagery and label (if provided)
    # before entering the main loop
    valid_gpkg_set = set()
    for info in tqdm(list_img, desc='Validating imagery'):
        # validate_raster(info['tif'], num_bands, meta_map)
        if 'gpkg' in info.keys() and info['gpkg'] and info['gpkg'] not in valid_gpkg_set:
            validate_num_classes(vector_file=info['gpkg'],
                                 num_classes=num_classes,
                                 attribute_name=info['attribute_name'],
                                 ignore_index=dontcare,
                                 target_ids=targ_ids)
            assert_crs_match(info['tif'], info['gpkg'])
            valid_gpkg_set.add(info['gpkg'])
    logging.info('Successfully validated imagery')
    if valid_gpkg_set:
        logging.info('Successfully validated label data for benchmarking')

    if task == 'classification':
        # FIXME: why don't we load from checkpoint in classification?
        classifier(params, list_img, model, device, working_folder)
    elif task == 'segmentation':
        gdf_ = []
        gpkg_name_ = []
        # TODO: Add verifications?
        if bucket:
            bucket.download_file(loaded_checkpoint, "saved_model.pth.tar")  # TODO: is this still valid?
            model, _ = load_from_checkpoint("saved_model.pth.tar", model)
        else:
            model, _ = load_from_checkpoint(loaded_checkpoint, model)

        # LOOP THROUGH LIST OF INPUT IMAGES
        for info in tqdm(list_img, desc='Inferring from images', position=0, leave=True):
            with start_run(run_name=Path(info['tif']).name, nested=True):
                img_name = Path(info['tif']).name
                local_gpkg = Path(info['gpkg']) if 'gpkg' in info.keys() and info['gpkg'] else None
                gpkg_name = local_gpkg.stem if local_gpkg else None
                if bucket:
                    local_img = f"Images/{img_name}"
                    bucket.download_file(info['tif'], local_img)
                    inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
                    if info['meta']:
                        if info['meta'] not in bucket_file_cache:
                            bucket_file_cache.append(info['meta'])
                            bucket.download_file(info['meta'], info['meta'].split('/')[-1])
                        info['meta'] = info['meta'].split('/')[-1]
                else:
                    # FIXME: else statement should support img['meta'] integration as well.
                    local_img = Path(info['tif'])
                    Path.mkdir(working_folder.joinpath(local_img.parent.name),
                               parents=True, exist_ok=True)
                    inference_image = working_folder.joinpath(
                        local_img.parent.name, f"{img_name.split('.')[0]}_inference.tif")
                temp_file = working_folder.joinpath(local_img.parent.name,
                                                    f"{img_name.split('.')[0]}.dat")
                raster = rasterio.open(local_img, 'r')
                logging.info(f'Reading original image: {raster.name}')
                inf_meta = raster.meta
                label = None
                if local_gpkg:
                    logging.info(f'Burning label as raster: {local_gpkg}')
                    local_img = clip_raster_with_gpkg(raster, local_gpkg)
                    raster.close()
                    raster = rasterio.open(local_img, 'r')
                    logging.info(f'Reading clipped image: {raster.name}')
                    inf_meta = raster.meta
                    label = vector_to_raster(
                        vector_file=local_gpkg,
                        input_image=raster,
                        out_shape=(inf_meta['height'], inf_meta['width']),
                        attribute_name=info['attribute_name'],
                        fill=0,  # background value in the rasterized vector
                        target_ids=targ_ids)
                    if debug:
                        logging.debug(f'Unique values in loaded label as raster: {np.unique(label)}\n'
                                      f'Shape of label as raster: {label.shape}')

                pred, gdf = segmentation(param=params,
                                         input_image=raster,
                                         label_arr=label,
                                         num_classes=num_classes_backgr,
                                         gpkg_name=gpkg_name,
                                         model=model,
                                         chunk_size=chunk_size,
                                         device=device,
                                         scale=scale,
                                         BGR_to_RGB=BGR_to_RGB,
                                         tp_mem=temp_file,
                                         debug=debug)
                if gdf is not None:
                    gdf_.append(gdf)
                    gpkg_name_.append(gpkg_name)
                if local_gpkg:
                    pixelMetrics = ComputePixelMetrics(label, pred, num_classes_backgr)
                    log_metrics(pixelMetrics.update(pixelMetrics.iou))
                    log_metrics(pixelMetrics.update(pixelMetrics.dice))

                pred = pred[np.newaxis, :, :].astype(np.uint8)
                inf_meta.update({"driver": "GTiff",
                                 "height": pred.shape[1],
                                 "width": pred.shape[2],
                                 "count": pred.shape[0],
                                 "dtype": 'uint8',
                                 "compress": 'lzw'})
                logging.info(f'Successfully inferred on {img_name}\n'
                             f'Writing to file: {inference_image}')
                with rasterio.open(inference_image, 'w+', **inf_meta) as dest:
                    dest.write(pred)
                del pred
                try:
                    temp_file.unlink()
                except OSError as e:
                    logging.warning(f'File Error: {temp_file, e.strerror}')
                if raster_to_vec:
                    start_vec = time.time()
                    inference_vec = working_folder.joinpath(
                        local_img.parent.name, f"{img_name.split('.')[0]}_inference.gpkg")
                    ras2vec(inference_image, inference_vec)
                    end_vec = time.time() - start_vec
                    logging.info('Vectorization completed in {:.0f}m {:.0f}s'.format(
                        end_vec // 60, end_vec % 60))

        if len(gdf_) >= 1:
            if not len(gdf_) == len(gpkg_name_):
                raise ValueError('benchmarking unable to complete')
            all_gdf = pd.concat(gdf_)  # Concatenate all geodataframes into one
            all_gdf.reset_index(drop=True, inplace=True)
            gdf_x = gpd.GeoDataFrame(all_gdf)
            bench_gpkg = working_folder / "benchmark.gpkg"
            gdf_x.to_file(bench_gpkg, driver="GPKG", index=False)
            logging.info(f'Successfully wrote benchmark geopackage to: {bench_gpkg}')
        # log_artifact(working_folder)

    time_elapsed = time.time() - since
    logging.info('Inference Script completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
def evaluate_model(
    model: keras.Model,
    thr: float,
    dpath: Path,
    rpath: Path,
    vids: Tuple[str],
    batch_size: int,
    bands: Tuple[int] = (4, 3, 2, 5),
    bands_names: Tuple[str] = ("red", "green", "blue", "nir"),
    img_ids: List[str] = None,
    resize: bool = False,
    normalize: bool = True,
    standardize: bool = False,
    metric_fns: List[Callable] = [normalized_mutual_info_score, adjusted_rand_score],
    mlflow: bool = False,
    run_name: str = None,
) -> Tuple[Dict, List]:
    """
    Get evaluation metrics for the given model on the L8CCA test set.

    :param model: trained model to make predictions.
    :param thr: threshold to be used during evaluation.
    :param dpath: path to dataset.
    :param rpath: path to directory where results and artifacts should be logged.
    :param vids: tuple of ids of images which should be used to create
        visualisations. If it contains '*', visualisations will be created
        for all images in the dataset.
    :param batch_size: size of generated batches; only one batch is loaded
        to memory at a time.
    :param bands: band numbers to load.
    :param bands_names: names of the bands to load. Should have the same
        number of elements as bands.
    :param img_ids: if given, process only these images.
    :param resize: whether to resize loaded img to gt.
    :param normalize: whether to normalize the image.
    :param standardize: whether to standardize the image.
    :param metric_fns: non-Tensorflow metric functions to run evaluation
        of the model. Must be of the form func(labels_true, labels_pred).
    :param mlflow: whether to use MLflow.
    :param run_name: name of the run.
    :return: evaluation metrics and evaluation times for scenes.
    """
    Path(rpath).mkdir(parents=True, exist_ok=False)
    if mlflow:
        setup_mlflow(run_name)
        params = dict(locals())
        del params["model"]
        del params["metric_fns"]
        log_params(params)

    metrics = {}
    scene_times = []
    for metric_fn in model.metrics:
        if type(metric_fn) is str:
            metric_name = metric_fn
        else:
            metric_name = metric_fn.__name__
        metrics[f"L8CCA_{metric_name}"] = {}
    for metric_fn in metric_fns:
        metrics[f"L8CCA_{metric_fn.__name__}"] = {}

    for tname in os.listdir(dpath):
        tpath = dpath / tname
        for img_id in os.listdir(tpath):
            if img_ids is not None:
                if img_id not in img_ids:
                    continue
            print(f"Processing {tname}-{img_id}", flush=True)
            img_path = tpath / img_id
            img_pred, scene_time = get_img_pred(path=img_path, model=model,
                                                batch_size=batch_size,
                                                bands=bands, bands_names=bands_names,
                                                resize=resize, normalize=normalize,
                                                standardize=standardize)
            scene_times.append(scene_time)
            img_gt = load_l8cca_gt(path=img_path)
            img_pred = unpad(img_pred, img_gt.shape)
            img_metrics = get_metrics_tf(np.expand_dims(img_gt, axis=0),
                                         np.expand_dims((img_pred > thr), axis=0),
                                         model.metrics)
            for metric_fn in model.metrics:
                if type(metric_fn) is str:
                    metric_name = metric_fn
                else:
                    metric_name = metric_fn.__name__
                metrics[f"L8CCA_{metric_name}"][img_id] = img_metrics[f"{metric_name}"]
            for metric_fn in metric_fns:
                metrics[f"L8CCA_{metric_fn.__name__}"][img_id] = metric_fn(
                    img_gt.reshape(-1), (img_pred > thr).reshape(-1))
            print("Average inference time: "
                  + f"{sum(scene_times) / len(scene_times)} seconds")

            if img_id in vids or "*" in vids:
                print(f"Creating visualisation for {img_id}")
                img_vis = build_rgb_scene_img(img_path, img_id)
                save_vis(img_id, img_vis, img_pred > thr, img_gt, rpath)

            if img_metrics["jaccard_index_metric"] < 0.6:
                print(f"Will make insights for {img_id}", flush=True)
                y_gt = img_gt.ravel()
                y_pred = np.round(img_pred.ravel(), decimals=5)
                make_roc(y_gt, y_pred, rpath / img_id, thr_marker=thr)
                make_precision_recall(y_gt, y_pred, rpath / img_id, thr_marker=thr)
                # Make a histogram with more rounded predictions
                # for performance reasons
                y_pred = np.round(y_pred, decimals=2)
                make_activation_hist(y_pred, rpath / img_id)

    return metrics, scene_times
def train(config, run_time):
    # mlflow_path = os.path.join(os.getcwd(), 'mlflow')
    # uri = f'file://{mlflow_path}'
    # mlflow.set_tracking_uri(uri)
    with mlflow.start_run():
        print('Starting model training')
        dpl = DataPipeline(config, run_time)

        # TODO: convert to DB
        dpl.read_data('interim_train_data')
        train_data = dpl._data
        print('train:', train_data.shape)

        # TODO: convert to DB
        dpl.read_data('interim_test_data')
        test_data = dpl._data
        print(test_data.shape)

        print('Getting model selection')
        model_selection = int(config.get('DEFAULT', 'model_selection'))

        # splitting data
        print('splitting X, y data')
        X_train = train_data['comment_text']
        y_train = train_data.iloc[:, 1:7]
        X_test = test_data['comment_text']
        y_test = test_data.iloc[:, 1:7]

        # get model params
        print('getting model params')
        model_params = _get_model_params(config, model_selection, no_defaults=True)
        model_params['run_time'] = run_time
        print('model_params', model_params)

        # init model
        print('model init')
        model = BiLSTM(X_train=X_train, y_train=y_train, validation_data=None, **model_params)

        # train model
        print('training model')
        model.train()

        # evaluate  TODO: create another split from train
        print('evaluating model')
        evaluation = model.evaluate(X_test, y_test)
        print('evaluation', evaluation)

        # save
        print('saving model')
        model.save_model(mlflow, model_params['save_path'])

        print('logging params')
        mlflow.log_params(model_params)
        print(model._history.history)

        print('logging metrics')
        mlflow.log_metrics({'loss': evaluation[0], 'accuracy': evaluation[1]})
    # The context manager ends the run; no explicit mlflow.end_run() needed.
    return
    pretrained_embeddings_path=args.pretrained_embeddings,
    token_to_index=args.token_to_index,
    n_labels=train_dataset.n_labels,
    dropout=args.dropout,
    batch_size=args.batch_size,
    vector_size=args.embeddings_size,
    filter_count=args.filter_count,
    filters_length=args.filters_length,
    freeze_embedings=True  # This can be a hyperparameter
)
mlflow.log_params({
    "model_type": "Convolutional Neural Network",
    "embeddings": args.pretrained_embeddings,
    "batch_size": args.batch_size,
    "filter_count": args.filter_count,
    "filters_length": args.filters_length,
    "dropout": args.dropout,
    "embeddings_size": args.embeddings_size,
    "epochs": args.epochs,
    "comments": args.comments[:249]
})
logging.info(str(model))

model = model.to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,  # This can be a hyperparameter
    weight_decay=1e-5  # This can be a hyperparameter
)
def log_config():
    mlflow.log_param("seed", Config.seed)
    mlflow.log_param("n_splits", Config.n_splits)
    mlflow.log_param("bert_model", Config.bert_model)
    mlflow.log_params({f"lgb_{k}": v for k, v in Config.lgb_params.items()})
    mlflow.log_params({f"cat_{k}": v for k, v in Config.cat_params.items()})
def main(c, r):
    r.scores = {}

    with blocktimer('Preprocess', level=INFO):
        # unpack feature set list. set[i] = {name: cols}
        for name, col_list in c.feature.set.items():
            in_train_path = f'data/feature/{name}_train.pkl'
            in_test_path = f'data/feature/{name}_test.pkl'
            cols = col_list
            train = pd.read_pickle(in_train_path)
            test = pd.read_pickle(in_test_path)
            logger.debug(f'Loaded feature {name}')

        if c.runtime.use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop(c.feature.target, axis=1)
        y_train = train[c.feature.target].copy(deep=True)
        X_test = test
        del train, test

    with blocktimer('Tune hyper params', level=INFO):
        '''
        Run optimization
        '''
        mlflow.log_param('type', c.model.type)
        mlflow.log_param('num_boost_round', c.train.num_boost_round)
        mlflow.log_param('early_stopping_rounds', c.train.early_stopping_rounds)

        f = partial(objective, X_train=X_train, y_train=y_train, X_test=X_test, cols=cols, c=c)
        opt = optuna.create_study(
            direction='maximize',
            study_name=f'{experiment_type}_{c.runtime.version}{c.runtime.dsize}',
            storage=f'sqlite:///data/optimization/'
                    f'{experiment_type}_{c.runtime.version}{c.runtime.dsize}.db',
            load_if_exists=True)
        opt.optimize(f, n_trials=c.optimize.n_trials)
        trial = opt.best_trial

        r.optimize = {}
        r.scores.best_trial = trial.number
        r.scores.best_score = trial.value
        r.optimize.best_params = trial.params
        tuned_params = c.model.params.copy()
        tuned_params.update(trial.params)
        r.model.tuned_params = tuned_params

        logger.debug(f'Best trial: {trial.number}')
        logger.debug(f'Best score: {trial.value}')
        logger.debug(f'Best params: {trial.params}')
        mlflow.log_metric('best_trial', trial.number)
        mlflow.log_metric('best_score', trial.value)
        mlflow.log_params(trial.params)
    return r
from __future__ import print_function

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import metrics

import mlflow
import mlflow.sklearn

if __name__ == '__main__':
    # mlflow.create_experiment("mlflowproject_demo1")
    # mlflow.set_experiment("mlflowproject_demo1")
    X, y = load_iris(return_X_y=True)
    clf = svm.SVC(kernel='linear', C=10)
    scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')

    mlflow.log_params({'kernel': 'linear', 'C': 10})
    mlflow.log_metrics({"score": scores.mean(), 'score2': scores[0]})
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print("Model saved in run %s" % mlflow.active_run().info.run_uuid)
    mlflow.sklearn.log_model(clf, "SVM_CLF_model")
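# A short follow-up sketch: a model logged with mlflow.sklearn.log_model can be
# loaded back by run ID. This assumes the run from the script above is still
# active and that the artifact path matches the one used above ("SVM_CLF_model").
run_id = mlflow.active_run().info.run_id
loaded_clf = mlflow.sklearn.load_model(f"runs:/{run_id}/SVM_CLF_model")
print(loaded_clf.get_params())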
def log_params(self, params):
    mlflow.log_params(params)
mlflow.tensorflow.autolog()
# mlflow.log_params(experiment_config)
print(f'BEGINNING: DATASET:{args.dataset_name}|MODEL:{args.model_name}|'
      f'bsz:{args.batch_size}|lr:{args.base_learning_rate}|'
      f'Color_type={args.color_type}|regularizer={regularizer}')
print('-' * 30)

trainer = main(experiment_config, experiment_dir)
history = trainer.history
histories.append((args.dataset_name, args.model_name, history))

mlflow.log_params(args.__dict__)
for k, v in trainer.configs.items():
    mlflow.log_params(v)
    print('logged ', k)
mlflow_log_history(history)

# #########################################
# for model_name in model_names:
#     for dataset_name in dataset_names:
#         for lr in learning_rates:
#             for bsz in batch_sizes:
#                 with mlflow.start_run(run_name=f'{args.model_name}-{args.dataset_name}'
#                                                f'-{color_type}-lr_{args.base_learning_rate}'
#                                                f'-bsz_{args.batch_size}',
#                                       nested=True):
#                     for regularizer in regularizations:
#                         args.batch_size = bsz
#                         args.base_learning_rate = lr
def add_params(self, params_dict):
    if self.use_mlflow:
        mlflow.log_params(params_dict)
def train(config): mlflow.set_experiment(config['dataset']) mlflow.log_params(config) print('Random seed: %d' % int(config['seed'])) torch.manual_seed(config['seed']) print("Training {} epochs".format(config['nepochs'])) torch.backends.cudnn.benchmark = True dataset = S3dDataset(root=config['root'], npoints=config['npoints'], train=True, load=True) test_dataset = S3dDataset(root=config['root'], npoints=config['npoints'], train=False, load=True) num_classes = dataset.num_classes if config['balance']: train_weights = get_weights(dataset, 'train', root=config['root'], n_classes=num_classes) test_weights = get_weights(test_dataset, 'test', root=config['root'], n_classes=num_classes) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=BatchSampler(WeightedRandomSampler( weights=train_weights, num_samples=len(dataset)), batch_size=config['batchsize'], drop_last=True), num_workers=config['workers']) test_dataloader = torch.utils.data.DataLoader( test_dataset, batch_sampler=BatchSampler(WeightedRandomSampler( weights=test_weights, num_samples=len(test_dataset)), batch_size=config['batchsize'], drop_last=True), num_workers=config['workers']) else: dataloader = torch.utils.data.DataLoader( dataset, batch_size=config['batchsize'], shuffle=True, num_workers=config['workers'], drop_last=True) test_dataloader = torch.utils.data.DataLoader( test_dataset, batch_size=config['batchsize'], shuffle=True, num_workers=config['workers'], drop_last=True) print('number of classes: %d' % num_classes) print('train set size: %d | test set size: %d' % (len(dataset), len(test_dataset))) try: os.makedirs(config['outf']) except: pass blue = lambda x: '\033[94m' + x + '\033[0m' yellow = lambda x: '\033[93m' + x + '\033[0m' red = lambda x: '\033[91m' + x + '\033[0m' classifier = PointNetSeg(k=num_classes) model_epoch_cumulatiove_base = 0 if config.get('model'): print('Loading model from: {}'.format(config.get('model'))) classifier.load_state_dict(torch.load(config['model'])) elif config.get('continue'): model_path, model_epoch_cumulatiove_base = get_path_of_last_model( config) if model_path: print('Loading model from: {}'.format(model_path)) classifier.load_state_dict(torch.load(model_path)) # model_path_dir = ... 
# run_id = "96771d893a5e46159d9f3b49bf9013e2" # pytorch_model = mlflow.pytorch.load_model( # "runs:/" + run_id + "/" + model_path_dir) optimizer = optim.SGD(classifier.parameters(), lr=config['lr'], momentum=config['momentum']) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') classifier.to(device) if config.get('mgpu'): classifier = torch.nn.DataParallel(classifier, device_ids=config['gpuids']) num_batch = len(dataset) / config['batchsize'] for epoch in range(config['nepochs']): train_acc_epoch, train_iou_epoch, test_acc_epoch, test_iou_epoch = [], [], [], [] try: for i, data in enumerate(dataloader): t0 = time.time() points, labels = data points = points.transpose(2, 1) points, labels = points.to(device), labels.to(device) optimizer.zero_grad() classifier = classifier.train() pred, _ = classifier(points) pred = pred.view(-1, num_classes) labels = labels.view(-1, 1)[:, 0] if config['weight']: # unique labels, counts u, c = torch.unique(labels, return_counts=True) print(u, c) w = c.sum().float() / c # weights w = w / w.sum() # normalized weights # filling missing labels weights with zeros w = torch.zeros(num_classes).scatter_(0, u, w) print(w) loss = F.nll_loss(pred, labels, weight=w) else: loss = F.nll_loss(pred, labels) loss.backward() optimizer.step() time_per_step = time.time() - t0 t0 = time.time() pred_choice = pred.data.max(1)[1] correct = pred_choice.eq(labels.data).cpu().sum() train_acc = correct.item() / float( config['batchsize'] * config['npoints']) train_iou = correct.item() / float(2 * config['batchsize'] * config['npoints'] - correct.item()) train_acc_epoch.append(train_acc) train_iou_epoch.append(train_iou) mlflow.log_metric('train_acc', train_acc) mlflow.log_metric('train_iou', train_iou) mlflow.log_metric('train_loss', loss.item()) log_metric_to_mlflow(labels.data.numpy(), pred_choice.numpy(), 'train', target_names=[ 'board', 'floor', 'door', 'bookcase', 'column', 'ceiling', 'wall', 'stairs', 'beam', 'chair', 'clutter', 'table', 'window', 'sofa' ], cm_plot=False, cm_norm='true', root=config['root'], ds_name=config['dataset'], verbose=0) time_per_log = time.time() - t0 mlflow.log_metric('time_per_step', time_per_step) mlflow.log_metric('time_per_log', time_per_log) print( 'epoch %d: %d/%d | train loss: %f | train acc: %f | train iou: %f' % (epoch + 1, i + 1, num_batch + 1, loss.item(), train_acc, train_iou)) if (i + 1) % 10 == 0: j, data = next(enumerate(test_dataloader, 0)) points, labels = data points = points.transpose(2, 1) points, labels = points.to(device), labels.to(device) classifier = classifier.eval() with torch.no_grad(): pred, _ = classifier(points) pred = pred.view(-1, num_classes) labels = labels.view(-1, 1)[:, 0] loss = F.nll_loss(pred, labels) pred_choice = pred.data.max(1)[1] correct = pred_choice.eq(labels.data).cpu().sum() test_acc = correct.item() / float( config['batchsize'] * config['npoints']) test_iou = correct.item() / float(2 * config['batchsize'] * config['npoints'] - correct.item()) test_acc_epoch.append(test_acc) test_iou_epoch.append(test_iou) mlflow.log_metric('test_acc', test_acc) mlflow.log_metric('test_iou', test_iou) mlflow.log_metric('test_loss', loss.item()) mlflow.log_metric('train_acc', train_acc) mlflow.log_metric('train_iou', train_iou) mlflow.log_metric('train_loss', loss.item()) log_metric_to_mlflow(labels.data.numpy(), pred_choice.numpy(), 'test', target_names=[ 'board', 'floor', 'door', 'bookcase', 'column', 'ceiling', 'wall', 'stairs', 'beam', 'chair', 'clutter', 'table', 'window', 'sofa' ], cm_plot=True, 
                                         root=config['root'],
                                         ds_name=config['dataset'],
                                         verbose=0)
                    # mlflow.pytorch.log_model(classifier, 'model')
                    # (AttributeError unless mlflow.pytorch is imported explicitly)
                    # mlflow.pytorch.save_model(classifier, os.path.join(
                    #     config['outf'], '{}_model_{}.pth'.format(
                    #         config['dataset'],
                    #         model_epoch_cumulative_base + epoch)))
                    torch.save(
                        classifier.state_dict(),
                        os.path.join(
                            config['outf'],
                            '{}_model_{}.pth'.format(
                                config['dataset'],
                                model_epoch_cumulative_base + epoch)))
                    print(blue('epoch %d: %d/%d | test loss: %f | test acc: %f | test iou: %f')
                          % (epoch + 1, i + 1, num_batch + 1, loss.item(),
                             test_acc, test_iou))
            print(yellow('epoch %d | mean train acc: %f | mean train IoU: %f')
                  % (epoch + 1, np.mean(train_acc_epoch),
                     np.mean(train_iou_epoch)))
            print(red('epoch %d | mean test acc: %f | mean test IoU: %f')
                  % (epoch + 1, np.mean(test_acc_epoch),
                     np.mean(test_iou_epoch)))
        except KeyboardInterrupt:
            print('User interruption')
            break
        finally:
            # Checkpoint after every epoch, including on interruption.
            torch.save(
                classifier.state_dict(),
                os.path.join(
                    config['outf'],
                    '{}_model_{}.pth'.format(
                        config['dataset'],
                        model_epoch_cumulative_base + epoch)))
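# A minimal, illustrative invocation of train() above. Every key mirrors one the
# function actually reads; the concrete values are assumptions, not defaults
# taken from this project.
if __name__ == '__main__':
    example_config = {
        'dataset': 's3dis',        # also used as the mlflow experiment name
        'root': './data/s3dis',    # dataset root (assumed layout)
        'npoints': 4096,
        'nepochs': 25,
        'batchsize': 16,
        'workers': 4,
        'lr': 0.01,
        'momentum': 0.9,
        'seed': 42,
        'balance': False,          # weighted sampling off
        'weight': False,           # per-batch class weights off
        'outf': './checkpoints',
    }
    train(example_config)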
def main(args: DictConfig):
    # Non-strict access to fields
    OmegaConf.set_struct(args, False)

    # Adding default estimator params
    default_names, _, _, default_values, _, _, _ = \
        inspect.getfullargspec(
            instantiate(args.estimator, context_size=0).__class__.__init__)
    if default_values is not None:
        args.estimator['defaults'] = {
            n: str(v)
            for (n, v) in zip(
                default_names[len(default_names) - len(default_values):],
                default_values)
        }
    logger.info(OmegaConf.to_yaml(args, resolve=True))

    # Data-generating DAG
    data_path = hydra.utils.to_absolute_path(
        f'{ROOT_PATH}/{args.data.relative_path}')
    exp_name = args.data.relative_path.split('/')[-1]
    adjacency_matrix = np.load(
        f'{data_path}/DAG{args.data.sample_ind}.npy').astype(int)
    if exp_name == 'sachs_2005':
        var_names = np.load(f'{data_path}/sachs-header.npy')
    else:
        var_names = [f'x{i}' for i in range(len(adjacency_matrix))]
    dag = DirectedAcyclicGraph(adjacency_matrix, var_names)

    # Experiment tracking
    mlflow.set_tracking_uri(args.exp.mlflow_uri)
    mlflow.set_experiment(exp_name)

    # Checking whether the run already exists
    if check_existing_hash(args, exp_name):
        logger.info('Skipping existing run.')
        return
    else:
        logger.info('No runs found - performing one.')

    # Loading train-test data
    data = np.load(f'{data_path}/data{args.data.sample_ind}.npy')
    if args.data.standard_normalize:
        standard_normalizer = StandardScaler()
        data = standard_normalizer.fit_transform(data)
    data_train, data_test = train_test_split(data,
                                             test_size=args.data.test_ratio,
                                             random_state=args.data.split_seed)
    train_df = pd.DataFrame(data_train, columns=dag.var_names)
    test_df = pd.DataFrame(data_test, columns=dag.var_names)

    mlflow.start_run()
    mlflow.log_params(flatten_dict(args))
    mlflow.log_param('data_generator/dag/n', len(var_names))
    mlflow.log_param('data_generator/dag/m', int(adjacency_matrix.sum()))
    mlflow.log_param('data/n_train', len(train_df))
    mlflow.log_param('data/n_test', len(test_df))

    # Saving artifacts
    train_df.to_csv(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/train.csv'),
        index=False)
    test_df.to_csv(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/test.csv'),
        index=False)
    dag.plot_dag()
    plt.savefig(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/dag.png'))
    if len(dag.var_names) <= 20:
        df = pd.concat([train_df, test_df],
                       keys=['train', 'test']).reset_index().drop(columns=['level_1'])
        g = sns.pairplot(df, plot_kws={'alpha': 0.25}, hue='level_0')
        g.fig.suptitle(exp_name)
        plt.savefig(
            hydra.utils.to_absolute_path(
                f'{mlflow.get_artifact_uri()}/data.png'))

    metrics = {}
    for var_ind, target_var in enumerate(dag.var_names):
        var_results = {}
        # Considering all the other variables as inputs
        input_vars = [var for var in dag.var_names if var != target_var]
        y_train = train_df.loc[:, target_var].values
        X_train = train_df.loc[:, input_vars].values
        y_test = test_df.loc[:, target_var].values
        X_test = test_df.loc[:, input_vars].values

        # Initialising risks
        risks = {}
        for risk in args.predictors.risks:
            risks[risk] = getattr(importlib.import_module('sklearn.metrics'),
                                  risk)

        # Fitting predictive models
        models = {}
        for pred_model in args.predictors.pred_models:
            logger.info(f'Fitting {pred_model._target_} for target = '
                        f'{target_var} and inputs {input_vars}')
            model = instantiate(pred_model)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            models[pred_model._target_] = model
            for risk, risk_func in risks.items():
                var_results[f'test_{risk}_{pred_model._target_}'] = risk_func(
                    y_test, y_pred)

        sampler = instantiate(args.estimator.sampler,
                              X_train=X_train,
                              fit_method=args.estimator.fit_method,
                              fit_params=args.estimator.fit_params)

        # =================== Relative feature importance ===================
        # 1. G = MB(target_var), FoI = input_vars / MB(target_var)
        G_vars_1 = list(dag.get_markov_blanket(target_var))
        fsoi_vars_1 = [var for var in input_vars
                       if var not in list(dag.get_markov_blanket(target_var))]
        prefix_1 = 'mb'

        # 2. G = input_vars / MB(target_var), FoI = MB(target_var)
        fsoi_vars_2 = list(dag.get_markov_blanket(target_var))
        G_vars_2 = [var for var in input_vars
                    if var not in list(dag.get_markov_blanket(target_var))]
        prefix_2 = 'non_mb'

        for (G_vars, fsoi_vars, prefix) in zip([G_vars_1, G_vars_2],
                                               [fsoi_vars_1, fsoi_vars_2],
                                               [prefix_1, prefix_2]):
            G = search_nonsorted(input_vars, G_vars)
            fsoi = search_nonsorted(input_vars, fsoi_vars)

            rfi_gof_metrics = {}
            for f, f_var in zip(fsoi, fsoi_vars):
                estimator = sampler.train([f], G)

                # GoF diagnostics
                rfi_gof_results = {}
                if estimator is not None:
                    rfi_gof_results[f'rfi/gof/{prefix}_mean_log_lik'] = \
                        estimator.log_prob(inputs=X_test[:, f],
                                           context=X_test[:, G]).mean()
                rfi_gof_metrics = {
                    k: rfi_gof_metrics.get(k, []) + [rfi_gof_results.get(k, np.nan)]
                    for k in set(list(rfi_gof_metrics.keys()) +
                                 list(rfi_gof_results.keys()))
                }

            # Feature importance
            if len(fsoi) > 0:
                var_results[f'rfi/{prefix}_cond_size'] = len(G_vars)
                for model_name, model in models.items():
                    for risk, risk_func in risks.items():
                        rfi_explainer = explainer.Explainer(model.predict,
                                                            fsoi,
                                                            X_train,
                                                            sampler=sampler,
                                                            loss=risk_func,
                                                            fs_names=input_vars)
                        mb_explanation = rfi_explainer.rfi(
                            X_test, y_test, G, nr_runs=args.exp.rfi.nr_runs)
                        var_results[f'rfi/{prefix}_mean_rfi_{risk}_{model_name}'] = \
                            np.abs(mb_explanation.fi_vals(return_np=True)).mean()

            var_results = {
                **var_results,
                **{k: np.nanmean(v) if len(G_vars) > 0 else np.nan
                   for (k, v) in rfi_gof_metrics.items()}
            }

        # TODO =================== Global SAGE ===================

        mlflow.log_metrics(var_results, step=var_ind)
        metrics = {
            k: metrics.get(k, []) + [var_results.get(k, np.nan)]
            for k in set(list(metrics.keys()) + list(var_results.keys()))
        }

    # Logging mean statistics
    mlflow.log_metrics({k: np.nanmean(v) for (k, v) in metrics.items()},
                       step=len(dag.var_names))
    mlflow.end_run()
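# Hedged sketch of how main() above would be wired up as a Hydra entry point.
# The decorator arguments below are illustrative assumptions; the actual
# config_path/config_name are not shown in this file:
#
#   @hydra.main(config_path='configs', config_name='config')
#   def main(args: DictConfig): ...
#
#   if __name__ == '__main__':
#       main()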
def sociable_weavers_transforms(transform_config):
    """
    Create the train/validation transforms for bird classification.

    Training:
    1. Resize to (image_size, image_size) pixels
    2. Apply a random rotation of +-[rotation_angle] degrees
    3. With probability [transforms_prob], apply Gaussian blurring with
       kernel size [blur_kernel_size] and std ~ U([blur_min_std], [blur_max_std])
    4. Random resized crop to (net_input_size, net_input_size) pixels
       (the network's input size) with ratio=(1.0, 1.0) and scale=(0.8, 1.2)
    5. Apply a random horizontal flip
    6. Convert to tensor
    7. Normalize with mu=[dataset_means], sigma=[dataset_stds]

    Validation:
    1. Resize to (image_size, image_size) pixels
    2. Center crop to (net_input_size, net_input_size) pixels
       (the network's input size)
    3. Convert to tensor
    4. Normalize with mu=[dataset_means], sigma=[dataset_stds]
    """
    mlflow.log_params({
        'rotation_angle': int(transform_config['rotation_angle']),
        'blur_kernel_size': int(transform_config['blur_kernel_size']),
    })
    im_size = int(transform_config['image_size'])
    input_size = int(transform_config['net_input_size'])
    dataset_mean = json.loads(transform_config['dataset_means'])
    dataset_std = json.loads(transform_config['dataset_stds'])
    normalize = transforms.Normalize(dataset_mean, dataset_std)
    transforms_prob = float(transform_config['transforms_prob'])
    min_std = float(transform_config['blur_min_std'])
    max_std = float(transform_config['blur_max_std'])
    mlflow.log_param('Gaussian blur kernel min std', min_std)
    mlflow.log_param('Gaussian blur kernel max std', max_std)
    max_noise_variance = float(transform_config['max_noise_variance'])
    min_noise_variance = float(transform_config['min_noise_variance'])
    return {
        TRAIN_PHASE: transforms.Compose([
            transforms.Resize([im_size, im_size]),
            transforms.RandomRotation(int(transform_config['rotation_angle'])),
            transforms.RandomApply([
                transforms.GaussianBlur(
                    int(transform_config['blur_kernel_size']),
                    sigma=(min_std, max_std))
            ], p=transforms_prob),
            transforms.RandomResizedCrop(input_size, scale=(0.8, 1.2),
                                         ratio=(1.0, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            # transforms.RandomApply([transforms.Lambda(
            #     lambda x: x + (float(torch.rand(1)) * (
            #         max_noise_variance - min_noise_variance) +
            #         min_noise_variance ** 0.5) *
            #     torch.randn(3, input_size, input_size))],
            #     p=transforms_prob),
            normalize
        ]),
        TEST_PHASE: transforms.Compose([
            transforms.Resize(im_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize
        ])
    }
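# Illustrative config for sociable_weavers_transforms(); every value below is
# an assumption chosen to exercise each key the function reads. Note the
# function logs params, so it should be called inside an active mlflow run.
example_transform_config = {
    'image_size': '256',
    'net_input_size': '224',
    'rotation_angle': '15',
    'blur_kernel_size': '5',
    'blur_min_std': '0.1',
    'blur_max_std': '2.0',
    'transforms_prob': '0.5',
    'min_noise_variance': '0.0',
    'max_noise_variance': '0.01',
    'dataset_means': '[0.485, 0.456, 0.406]',   # ImageNet stats, as placeholders
    'dataset_stds': '[0.229, 0.224, 0.225]',
}
# phase_transforms = sociable_weavers_transforms(example_transform_config)
# train_tf, val_tf = phase_transforms[TRAIN_PHASE], phase_transforms[TEST_PHASE]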
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=5,
        metavar="N",
        help="number of epochs to train (default: 5)",
    )
    parser.add_argument(
        "--lr", type=float, default=1.0, metavar="LR",
        help="learning rate (default: 1.0)"
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.7,
        metavar="M",
        help="Learning rate step gamma (default: 0.7)",
    )
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument("--run-name", type=str, default=None,
                        help="MLflow run name.")
    parser.add_argument(
        "--save-model", action="store_true", default=False,
        help="For saving the current model"
    )
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    dataset = functools.partial(
        datasets.MNIST,
        root="../data",
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )
    train_loader = torch.utils.data.DataLoader(
        dataset(train=True), batch_size=args.batch_size, shuffle=True
    )
    test_loader = torch.utils.data.DataLoader(
        dataset(train=False), batch_size=args.test_batch_size, shuffle=True
    )

    model = modelling.Net()
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    mlflow.set_experiment("MNIST CNN")
    with mlflow.start_run(run_name=args.run_name):
        mlflow.log_params(vars(args))
        for epoch in range(1, args.epochs + 1):
            train(args, model, train_loader, optimizer, epoch)
            test(args, model, test_loader)
            scheduler.step()

        if args.save_model:
            model_path = "mnist_cnn.pt"
            torch.save(model.state_dict(), model_path)
            mlflow.log_artifact(model_path)
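# Assumed entry point for the MNIST script; the script name in the example
# invocation is hypothetical:
#   python mnist.py --epochs 5 --lr 1.0 --run-name baseline --save-model
if __name__ == "__main__":
    main()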
def _mlflow_log_params(params_dict):
    # Record library versions alongside the run's own hyperparameters.
    mlflow.log_params({
        "pytorch version": torch.__version__,
        "ignite version": ignite.__version__,
    })
    mlflow.log_params(params_dict)
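# Illustrative usage of _mlflow_log_params(); the hyperparameter values are
# assumptions. The torch/ignite versions are recorded automatically:
#
#   with mlflow.start_run():
#       _mlflow_log_params({"lr": 1e-4, "batch_size": 128, "epochs": 20})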
def main():
    parser = argparse.ArgumentParser(description="PyTorch CIFAR10 Training")
    parser.add_argument("--model", default="resnet56", type=str,
                        help="Model architecture")
    parser.add_argument("--dataset", type=str, default="CIFAR10",
                        help="Name of the dataset")
    parser.add_argument("--batch", type=int, default=128,
                        help="Test batch size")
    parser.add_argument("--perturb", type=float, default=10.0,
                        help="Magnitude of noise added to the input")
    parser.add_argument("--lr", default=1e-4, type=float,
                        help="learning rate")
    parser.add_argument("--epochs", type=int, default=20,
                        help="Number of epochs to fine-tune")
    args = parser.parse_args()

    EXPERIMENT_NAME = "Entropy Minimization"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("==> Preparing data...")
    trainloader, testloader = read_vision_dataset("./data",
                                                  batch_size=args.batch,
                                                  dataset=args.dataset)

    print("==> Building model...")
    net = resnet.__dict__[args.model]()
    net = net.to(device)
    net = torch.nn.DataParallel(net)
    # cudnn.benchmark = True
    checkpoint = torch.load("ckpt.pth", map_location=device)
    net.load_state_dict(checkpoint["net"])
    net.eval()

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                          weight_decay=5e-4)

    # start_run picks up the experiment set via set_experiment; in recent
    # MLflow versions set_experiment returns an Experiment object, not an id,
    # so the return value must not be passed as experiment_id.
    mlflow.set_experiment(EXPERIMENT_NAME)
    with mlflow.start_run():
        log_params(vars(args))
        for epoch in range(1, args.epochs + 1):
            # Entropy minimization adapts on the (unlabeled) test stream,
            # hence the test loader is passed to train().
            train(
                net,
                args.perturb,
                optimizer,
                testloader,
                device,
                epoch,
            )
            test(net, testloader, device, epoch)
        mlflow.pytorch.log_model(net, artifact_path="tuned model")
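# Standard entry-point guard (assumed; not shown in the original snippet).
if __name__ == "__main__":
    main()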