parser.add_argument("--output", help="Output from First & Hidden Layers", action='store', nargs='?', default=64, type=int)
parser.add_argument("--train_batch_size", help="Training Batch Size", nargs='?', action='store', default=128, type=int)
parser.add_argument("--epochs", help="Number of epochs for training", nargs='?', action='store', default=20, type=int)

args = parser.parse_args()

# Echo the effective configuration so it shows up in the console log.
print("drop_rate", args.drop_rate)
print("input_dim", args.input_dim)
print("size", args.bs)
print("output", args.output)
print("train_batch_size", args.train_batch_size)
print("epochs", args.epochs)

# Always read the values from `args`: the bare names (input_dim, bs, ...)
# are not bound anywhere in this section, so using them either raises
# NameError or silently ignores what was passed on the command line.
data = gen_data(input_dim=args.input_dim, bsize=args.bs)
model = build_model(in_dim=args.input_dim, drate=args.drop_rate, out=args.output)

start_time: float = time()
with mlflow.start_run():
    # NOTE(review): the training batch size is taken from --bs ("size"), as in
    # the original; --train_batch_size is only logged. Confirm this is intended.
    results = compile_and_run_model(model, data, epochs=args.epochs, batch_size=args.bs)

    mlflow.log_param("drop_rate", args.drop_rate)
    mlflow.log_param("input_dim", args.input_dim)
    mlflow.log_param("size", args.bs)
    mlflow.log_param("output", args.output)
    mlflow.log_param("train_batch_size", args.train_batch_size)
    mlflow.log_param("epochs", args.epochs)
    # NOTE(review): loss/accuracy are results rather than inputs; they are kept
    # as params here so existing queries keep working, but mlflow.log_metric
    # is the usual home for them.
    mlflow.log_param("loss", results[0])
    mlflow.log_param("acc", results[1])
end_time: float = time()

# %d truncates the elapsed time to whole seconds.
print("Run time = %d" % (end_time-start_time))
# Save the figure created above, then release it.
fig.savefig('/dbfs/mlflow/iris/iris1.png')
plt.close(fig)
display()
display()
# Notebook shell escape: removes the directory the figure was just saved into.
# NOTE(review): this deletes iris1.png as well - confirm that is intended.
!rm -r /dbfs/mlflow/iris
# 1st iteration: decision tree with default depth
mlflow.start_run()
dtc = DecisionTreeClassifier(random_state=10)
dtc.fit(X_train,Y_train)
y_pred_class= dtc.predict(X_test)
accuracy= metrics.accuracy_score(Y_test,y_pred_class)
print (accuracy)
mlflow.log_param("random_state",10 )
mlflow.log_metric("accuracy" , accuracy)
mlflow.sklearn.log_model (dtc , "model")
modelpath = "/dbfs/mlflow/iris/model-%s-%f" % ("decision_tree", 1)
mlflow.sklearn.save_model (dtc, modelpath)  # also save the model to an explicit path alongside the experiment
# Attach the plot as a run artifact. Other artifacts could be feature columns
# or different versions of the data.
# NOTE(review): "iris1.png" is a relative path, while the figure was saved
# under /dbfs/mlflow/iris/ - verify the file exists in the working directory.
mlflow.log_artifact("iris1.png" )
#mlflow.end_run()
# 2nd iteration: same model limited to max_depth=1
# NOTE(review): end_run() above is commented out, so the first run is
# presumably still active when start_run() is called again - confirm.
mlflow.start_run()
dtc = DecisionTreeClassifier(max_depth=1,random_state=10)  # changed hyperparameter
def test_search_runs():
    """Check ``MlflowClient.search_runs`` against metric, param and tag filters.

    Two runs are logged into the "exp-for-search" experiment and searched with
    a series of filter strings. The first run is then deleted to check which
    ``ViewType`` values still return it.
    """
    mlflow.set_experiment("exp-for-search")

    # Log two runs and remember their ids under short labels.
    run_ids = {}
    with mlflow.start_run() as first_run:
        run_ids["first"] = first_run.info.run_id
        mlflow.log_metric("m1", 0.001)
        mlflow.log_metric("m2", 0.002)
        mlflow.log_metric("m1", 0.002)
        mlflow.log_param("p1", "a")
        mlflow.set_tag("t1", "first-tag-val")
    with mlflow.start_run() as second_run:
        run_ids["second"] = second_run.info.run_id
        mlflow.log_metric("m1", 0.008)
        mlflow.log_param("p2", "aa")
        mlflow.set_tag("t2", "second-tag-val")

    client = MlflowClient()
    experiment_id = client.get_experiment_by_name(
        "exp-for-search").experiment_id

    def search(filter_string, *view_type):
        # view_type is empty for the calls that rely on the client's default.
        return client.search_runs([experiment_id], filter_string, *view_type)

    def assert_found(runs, expected_labels):
        found = {run.info.run_id for run in runs}
        wanted = {run_ids[label] for label in expected_labels}
        assert found == wanted

    # 2 runs in this experiment
    assert len(client.list_run_infos(experiment_id, ViewType.ACTIVE_ONLY)) == 2

    # (filter string, extra positional args, labels of the runs expected back)
    checks = [
        # both runs have metric "m1" > 0.0001
        ("metrics.m1 > 0.0001", (), ["first", "second"]),
        # only one run has metric "m1" > 0.002
        ("metrics.m1 > 0.002", (), ["second"]),
        # no runs with metric "m1" > 0.1
        ("metrics.m1 > 0.1", (), []),
        # 1 run with metric "m2" > 0
        ("metrics.m2 > 0", (), ["first"]),
        # 1 run each with param "p1" and "p2"
        ("params.p1 = 'a'", (ViewType.ALL,), ["first"]),
        ("params.p2 != 'a'", (ViewType.ALL,), ["second"]),
        ("params.p2 = 'aa'", (ViewType.ALL,), ["second"]),
        # 1 run each with tag "t1" and "t2"
        ("tags.t1 = 'first-tag-val'", (ViewType.ALL,), ["first"]),
        ("tags.t2 != 'qwerty'", (ViewType.ALL,), ["second"]),
        ("tags.t2 = 'second-tag-val'", (ViewType.ALL,), ["second"]),
    ]
    for filter_string, extra_args, expected in checks:
        assert_found(search(filter_string, *extra_args), expected)

    # Delete the "first" run: it must stay visible under ALL and DELETED_ONLY
    # and disappear from ACTIVE_ONLY.
    client.delete_run(run_ids["first"])
    checks_after_delete = [
        (ViewType.ALL, ["first"]),
        (ViewType.DELETED_ONLY, ["first"]),
        (ViewType.ACTIVE_ONLY, []),
    ]
    for view_type, expected in checks_after_delete:
        assert_found(search("params.p1 = 'a'", view_type), expected)
def cMAPPS(experiment_name, prepros_params, train_params, dataset_no, tracking=True, path=None):
    """Preprocess, train and score one C-MAPSS dataset, optionally tracked in MLflow.

    Args:
        experiment_name: MLflow experiment name (only used when ``tracking``).
        prepros_params: Keyword arguments forwarded to ``Preprocess.cMAPSS``.
        train_params: Keyword arguments forwarded to ``Training.RNN_to_FF``.
        dataset_no: Dataset number; must be between 1 and 4.
        tracking: When truthy, log parameters, losses, scores and the model
            directory to the ``sqlite:///mlflow.db`` tracking store.
        path: Optional data location handed to ``ci.set_datapath``.

    Returns:
        The tuple ``(ci, cp, rnn_ff, ct)``: the input module, the preprocessor
        instance, the trained network wrapper and the testing module.

    Raises:
        ValueError: If ``dataset_no`` is not between 1 and 4. This is checked
            before any MLflow run is opened, so a bad argument no longer
            leaves a failed run behind.
    """
    from Input import cMAPSS as ci
    from Preprocess import cMAPSS as CP
    from Testing import cMAPSS as ct
    import Training as tr

    if dataset_no not in range(1, 5):
        raise ValueError('Please choose a number between 1 and 4')

    if tracking:
        mlflow.set_tracking_uri('sqlite:///mlflow.db')
        mlflow.set_experiment(experiment_name)

        with mlflow.start_run():
            if path is not None:
                ci.set_datapath(path)
            ci.get_data(dataset_no)
            mlflow.log_param("DataSet Number", dataset_no)

            cp = CP(**prepros_params)
            cp.preprocess(ci.Train_input)

            run_id = mlflow.active_run().info.run_id

            # The context manager removes the temporary model directory even
            # if training or logging raises.
            with tempfile.TemporaryDirectory() as model_dir:
                rnn_ff = tr.RNN_to_FF(cp.features,
                                      **train_params,
                                      model_dir=model_dir,
                                      run_id=run_id)
                rnn_ff.create_model(cp.no_splits)
                rnn_ff.train_model(cp.splits_in, cp.splits_out, cp.no_splits)

                mlflow.log_param('Features', cp.features)
                mlflow.log_params(prepros_params)
                mlflow.log_params(train_params)
                # .tolist() must be *called*; logging the bare attribute
                # records the bound-method repr instead of the numbers.
                mlflow.log_params({
                    'MSE_Train': rnn_ff.loss.tolist(),
                    'MSE_Validation': rnn_ff.val_loss.tolist(),
                    'Delta_MSE': rnn_ff.del_loss.tolist()
                })
                mlflow.log_artifacts(model_dir)

            # Tags
            mlflow.set_tags({
                'RMSE_Train': (rnn_ff.loss**0.5).tolist(),
                'RMSE_Validation': (rnn_ff.val_loss**0.5).tolist()
            })

            cp.preprocess(ci.Test_input, isTrain=False)
            ct.get_score(rnn_ff.model, cp.test_in, ci.RUL_input)

            mlflow.log_params({
                'Score': ct.score.tolist(),
                'Test_MSE': ct.mse.tolist(),
                'Combined Score': ct.cm_score,
                'Combined MSE': ct.cm_mse
            })
            mlflow.set_tags({
                'Test RMSE': (ct.mse**0.5).tolist(),
                'Combined RMSE': ct.cm_mse**0.5
            })
    else:
        if path is not None:
            ci.set_datapath(path)
        ci.get_data(dataset_no)

        cp = CP(**prepros_params)
        cp.preprocess(ci.Train_input)

        rnn_ff = tr.RNN_to_FF(cp.features, **train_params)
        rnn_ff.create_model(cp.no_splits)
        rnn_ff.train_model(cp.splits_in, cp.splits_out, cp.no_splits)

        cp.preprocess(ci.Test_input, isTrain=False)
        # NOTE(review): this branch reads `rnn_ff.models` while the tracked
        # branch reads `rnn_ff.model`; one of the two is presumably a typo -
        # confirm against Training.RNN_to_FF.
        ct.get_score(rnn_ff.models, cp.test_in, ci.RUL_input)

    return ci, cp, rnn_ff, ct
    # NOTE(review): unreachable - it follows the return. Left in place rather
    # than moved, because clearing the session before returning `rnn_ff`
    # could invalidate the model handed back to the caller.
    clear_session()
# Separate the predictors from the "quality" target for both splits.
train_x, test_x = (frame.drop(["quality"], axis=1) for frame in (train, test))
train_y, test_y = (frame[["quality"]] for frame in (train, test))

# Hyperparameters come from the command line when given, else default to 0.5.
if len(sys.argv) > 1:
    alpha = float(sys.argv[1])
else:
    alpha = 0.5
if len(sys.argv) > 2:
    l1_ratio = float(sys.argv[2])
else:
    l1_ratio = 0.5

with mlflow.start_run():
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)
    rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print(" RMSE: %s" % rmse)
    print(" MAE: %s" % mae)
    print(" R2: %s" % r2)

    # Record the inputs, then the results, in the same order as before.
    for param_name, param_value in (("alpha", alpha), ("l1_ratio", l1_ratio)):
        mlflow.log_param(param_name, param_value)
    for metric_name, metric_value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
        mlflow.log_metric(metric_name, metric_value)

    mlflow.sklearn.log_model(lr, "model")
    print(mlflow.get_artifact_uri())
def train_and_evaluate(config_path: str):
    """Train an ElasticNet from a config file and log the run to MLflow.

    The config supplies the target column, the ElasticNet hyperparameters, the
    train/test CSV paths and the MLflow settings. Parameters and the RMSE, MAE
    and R2 metrics are logged. The model is also registered under
    ``registered_model_name`` unless the artifact URI uses the ``file`` scheme.

    Args:
        config_path: Path passed to ``read_params``.
    """
    logger = logging.getLogger()

    logger.info('Parsing the config file supplied')
    config = read_params(config_path)
    target_col = [config.base.target_col]
    enet_params = config.estimators.ElasticNet.params
    alpha = enet_params.alpha
    l1_ratio = enet_params.l1_ratio
    # Not used below; kept so a missing `model_dir` setting still surfaces here.
    model_dir = Path.cwd() / config.model_dir

    logger.info('Reading the Training and Test Data')
    split_paths = config.split_data
    train_df = pd.read_csv(split_paths['train_path'], sep=',')
    test_df = pd.read_csv(split_paths['test_path'], sep=',')

    train_y, test_y = train_df[target_col], test_df[target_col]
    train_x = train_df.drop(target_col, axis=1)
    test_x = test_df.drop(target_col, axis=1)

    mlflow_config = config.mlflow_config
    mlflow.set_tracking_uri(mlflow_config.remote_server_uri)
    mlflow.set_experiment(mlflow_config.experiment_name)

    with mlflow.start_run(run_name=mlflow_config.run_name) as mlops_run:
        logger.info('Training Started Data')
        lr = ElasticNet(alpha=alpha,
                        l1_ratio=l1_ratio,
                        random_state=config.base.random_state)
        lr.fit(train_x, train_y)
        logger.info('Training finished')

        logger.info('Predicting on the Test Data')
        predicted_qualities = lr.predict(test_x)
        rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print(" RMSE: %s" % rmse)
        print(" MAE: %s" % mae)
        print(" R2: %s" % r2)

        for param_name, param_value in (("alpha", alpha), ("l1_ratio", l1_ratio)):
            mlflow.log_param(param_name, param_value)
        for metric_name, metric_value in (("rmse", rmse), ("mae", mae), ("r2", r2)):
            mlflow.log_metric(metric_name, metric_value)

        logger.info('Model Saving !!!!')
        # Only register the model when the artifact URI is not a local file.
        artifact_scheme = urlparse(mlflow.get_artifact_uri()).scheme
        extra_kwargs = {}
        if artifact_scheme != "file":
            extra_kwargs["registered_model_name"] = (
                mlflow_config.registered_model_name)
        mlflow.sklearn.log_model(lr, "model", **extra_kwargs)
        logger.info('Training and Evaluating finished Model Saved !!!!')
MAX_DEPTH = 2

# Fit the random forest; the target is flattened to 1-D before fitting.
model = RandomForestRegressor(n_estimators=N_ESTIMATORS, max_depth=MAX_DEPTH)
model = model.fit(x_train, y_train.values.ravel())

# Persist the model and the column order next to each other.
for artifact, destination in ((model, 'models/model.joblib'),
                              (column_order, 'models/column_order.joblib')):
    joblib.dump(artifact, destination)

if not settings.SHOULD_USE_MLFLOW:
    print('Not logging training run because MLFlow tracking server is not up, or its URL is not set in train.py')
else:
    # log training run to mlflow
    mlflow.set_tracking_uri(uri=f'http://{settings.MLFLOW_IP}:5000')
    # Runs started with CI=true go to the 'CI' experiment, the rest to 'dev'.
    mlflow.set_experiment('CI' if os.environ.get('CI', '') == 'true' else 'dev')

    with mlflow.start_run() as run:
        # calculate evaluation metrics
        y_test_pred = model.predict(x_test)
        rmse = sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_test_pred))
        r2_score = metrics.r2_score(y_true=y_test, y_pred=y_test_pred)

        # log hyperparameters to mlflow
        for param_name, param_value in (('n_estimators', N_ESTIMATORS),
                                        ('max_depth', MAX_DEPTH)):
            mlflow.log_param(param_name, param_value)

        # log metrics to mlflow
        for metric_name, metric_value in (("rmse_validation_data", rmse),
                                          ("r2_score_validation_data", r2_score)):
            mlflow.log_metric(metric_name, metric_value)
# Select (or create) the experiment that groups these runs.
exp_name = "ntnu_course_classifier"
mlflow.set_experiment(exp_name)
with mlflow.start_run() as run:
    # Get path to save model
    tracking_uri = mlflow.tracking.get_tracking_uri()
    print("Logging to " + tracking_uri)
    artifact_uri = mlflow.get_artifact_uri()
    # Strip the "file:///" scheme so the URI can be used as a filesystem path.
    # NOTE(review): the split also removes the leading "/" of an absolute
    # POSIX path ("file:///home/x" -> "home/x") - confirm this is intended.
    if artifact_uri.startswith("file:///"):
        artifact_uri = artifact_uri.split("file:///")[1]
    print("Saving artifacts to " + artifact_uri)
    model_path = artifact_uri + model_prefix
    # Log params
    mlflow.log_param("seed", seed)
    mlflow.log_param("use_idf", use_idf)
    mlflow.log_param("use_stoplist", use_stoplist)
    mlflow.log_param("ngrams", args.ngrams)
    mlflow.log_param("data_file", data_file)
    mlflow.log_param("model_type", args.model)
    # The model-specific guards below are commented out, so these parameters
    # are logged for every model type.
    #if args.model == "nb":
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("fit_prior", fit_prior)
    #if args.model == "log_reg":
    mlflow.log_param("C", C)
    ## Data import and cleaning
    # Only columns 1-6 of the file are read; rows with any missing value are dropped.
    df = pd.read_csv(data_file, usecols=[1, 2, 3, 4, 5, 6])
    df = df.dropna()
def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation, logging the run to MLflow.

    For every fold the model is trained and saved, and its validation score
    is written to the log. The out-of-fold predictions are then restored to
    row order, scored with ``self.evaluation_metric``, dumped to
    ``../output/pred/<run_name>-train.pkl``, and the run's parameters and CV
    score are recorded in MLflow.

    Raises:
        ValueError: If ``self.evaluation_metric`` is not a supported metric
            name. This is checked before training so that a typo does not
            cost a full CV run.
    """
    # Map each supported metric name onto a (y_true, y_pred) -> score callable.
    scorers = {
        'log_loss': lambda y, p: log_loss(y, p, eps=1e-15, normalize=True),
        'mean_absolute_error': mean_absolute_error,
        'rmse': lambda y, p: np.sqrt(mean_squared_error(y, p)),
        'auc': roc_auc_score,
        'prauc': average_precision_score,
    }
    if self.evaluation_metric not in scorers:
        raise ValueError(
            f'Unsupported evaluation_metric {self.evaluation_metric!r}; '
            f'expected one of {sorted(scorers)}')
    compute_cv_score = scorers[self.evaluation_metric]

    # mlflow: the context manager closes the run even if training raises,
    # so a failed run cannot stay active and block the next start_run().
    mlflow.set_experiment(self.exp_name)
    with mlflow.start_run(run_name=self.run_name) as run:
        logger.info(f'{self.run_name} - start training cv')

        scores = []
        va_idxes = []
        preds = []

        # Adversarial validation: stack train and test and learn to tell them
        # apart (train rows are labelled 0, test rows 1).
        if self.advanced and 'adversarial_validation' in self.advanced:
            # .assign() works on copies, so the frames supplied by the caller
            # are not modified in place.
            combined = pd.concat(
                [self.X_train.assign(target=0),
                 self.X_test.assign(target=1)],
                sort=False).reset_index(drop=True)
            self.y_train = combined['target']
            self.X_train = combined.drop('target', axis=1)

        # Train each fold
        for i_fold in range(self.cv.n_splits):
            logger.info(f'{self.run_name} fold {i_fold} - start training')
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f'{self.run_name} fold {i_fold} - end training - score {score}'
            )

            # Save the fold's model
            model.save_model()

            # Keep the fold's results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Merge the folds: sort the out-of-fold predictions back into the
        # original row order so they line up with self.y_train.
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        cv_score = compute_cv_score(self.y_train, preds)

        logger.info(f'{self.run_name} - end training cv - score {cv_score}')

        # Save the out-of-fold predictions
        Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

        # mlflow
        self.run_id = run.info.run_id
        # NOTE(review): if model_cls is a class (not an instance),
        # __class__.__name__ yields its metaclass name - confirm intent.
        log_param('model_name', self.model_cls.__class__.__name__)
        log_param('fe_name', self.fe_name)
        log_param('train_params', self.params)
        log_param('cv_strategy', str(self.cv))
        log_param('evaluation_metric', self.evaluation_metric)
        log_metric('cv_score', cv_score)
        log_param(
            'fold_scores',
            {f'fold_{i}': round(s, 4) for i, s in enumerate(scores)})
        log_param('cols_definition', self.cols_definition)
        log_param('description', self.description)
def log_anomaly(experimentID, run_name, params, traindata, testdata):
    """Fit an IsolationForest and log model, params, metrics and a plot to MLflow.

    Args:
        experimentID: MLflow experiment id the run is created under.
        run_name: Name given to the MLflow run.
        params: Keyword arguments for ``IsolationForest``; each entry is also
            logged as a run parameter.
        traindata: Data the forest is fitted on and scored against.
        testdata: Held-out data that is only scored.

    Returns:
        The string ``'RunID:<run id>'`` for the run that was created.
    """
    import os

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    with mlflow.start_run(experiment_id=experimentID, run_name=run_name) as run:
        # Create model, train it, and create predictions
        iForest = IsolationForest(**params)
        iForest.fit(traindata)

        train_predictions = iForest.predict(traindata)
        test_predictions = iForest.predict(testdata)

        # Log model
        mlflow.sklearn.log_model(iForest, "ib-isolationforest-model")

        # Log params (plain loop: the calls are made for their side effect)
        for param, value in params.items():
            mlflow.log_param(param, value)

        # "Accuracy" here is the share of rows predicted as 1, which the
        # scikit-learn docs define as the inlier label.
        train_accuracy = round(
            list(train_predictions).count(1) / train_predictions.shape[0], 2)
        test_accuracy = round(
            list(test_predictions).count(1) / test_predictions.shape[0], 2)

        print('Accuracy Metrics:')
        print('--------------------------------------------')
        print(f'Accuracy for train data: {train_accuracy}')
        print(f'Accuracy for test data: {test_accuracy}')
        print(' ')

        # Log metrics
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("test_accuracy", test_accuracy)

        # Log artifact: histograms of the decision-function scores
        training_scores = iForest.decision_function(traindata)
        test_scores = iForest.decision_function(testdata)

        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
        fig.suptitle("Decision Function Score", fontsize=14)
        fig.subplots_adjust(top=0.85, wspace=0.3)

        ax1.set_title("Decision function score for trainingdata")
        ax1.hist(training_scores,
                 bins='auto',
                 alpha=0.7,
                 color='#0504aa',
                 rwidth=0.85)

        ax2.set_title("Decision function score for testdata")
        ax2.hist(test_scores,
                 bins='auto',
                 alpha=0.7,
                 color='#0504aa',
                 rwidth=0.85)

        n_id = run.info.run_uuid
        table_name = 'activityhistory'
        fig_dir = 'data' + '/' + 'decisionfunction'
        fig_path = fig_dir + '/' + table_name + '_' + str(n_id) + '.png'

        # savefig does not create missing directories, so make sure the
        # target folder exists before writing.
        os.makedirs(fig_dir, exist_ok=True)
        fig.savefig(fig_path)
        mlflow.log_artifact(fig_path)
        print(fig)
        # Release the figure; otherwise every call leaves one open in pyplot.
        plt.close(fig)

    return f'RunID:{n_id}'
# In[17]: # train all layers for layer in model.layers: layer.trainable = True callbacks_list = [checkpoint, csv_logger, reduceLROnPlat, early, qwk] model.compile(loss='categorical_crossentropy', # loss=kappa_loss, optimizer=Adam(lr=1e-4)) model.fit_generator( train_mixup, steps_per_epoch=np.ceil(float(len(train_x)) / float(batch_size)), validation_data=valid_generator, validation_steps=np.ceil(float(len(valid_x)) / float(batch_size)), epochs=epochs, verbose=1, workers=1, use_multiprocessing=False, callbacks=callbacks_list) qwk = QWKEvaluation(validation_data=(valid_generator, valid_y), batch_size=batch_size, interval=1) # In[18]: mlflow.log_param("size", SIZE) mlflow.log_metric("qwk", qwk) mlflow.log_artifact("resnet.png")
# Script entry point: run a KMeans experiment on Iris and track it in MLflow.
if __name__ == "__main__":
    exp_name = 'Iris'
    ver_name = '0.0'
    run_name = 'Iris Run'
    print("Experiment [{0}]".format(exp_name))
    mlflow.set_experiment(exp_name)
    # NOTE(review): `source_version` is presumably only accepted by older
    # MLflow releases - verify against the installed version.
    with mlflow.start_run(source_version=ver_name, run_name=run_name):
        print('Reading dataset')
        mlflow.log_param("Dataset", 'Iris')
        iris = sklearn.datasets.load_iris()
        X = iris.data
        y = iris.target
        print('Splitting')
        # Fraction of the samples held out for testing.
        test_proportion = 0.4
        mlflow.log_param("Test proportion", test_proportion)
        # Shuffled split with no fixed random_state, so it differs between runs.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_proportion, shuffle=True)
        print('Creating estimator using Kmeans')
        kmeans_n_clusters = 3
        mlflow.log_param("Kmeans N Clusters", kmeans_n_clusters)
        est = KMeans(n_clusters=kmeans_n_clusters)
        print('Fitting estimator')
metavar="LR",
help="learning rate (default: 0.1)",
)

args = parser.parse_args()
# Plain-dict view of the parsed arguments; not used within this section.
dict_args = vars(args)

# Train the network, report accuracy and run the conductance analyses inside
# a single named MLflow run.
with mlflow.start_run(run_name="Titanic_Captum_mlflow"):
    (
        net,
        train_features,
        train_labels,
        test_features,
        test_labels,
        feature_names,
    ) = train()

    compute_accuracy(net, train_features, train_labels, title="Train Accuracy")
    # The value returned for the test split is reused as the input to the
    # conductance helpers below.
    test_input_tensor = compute_accuracy(net, test_features, test_labels, title="Test Accuracy")
    feature_conductance(net, test_input_tensor)
    layer_conductance(net, test_input_tensor)
    neuron_conductance(net, test_input_tensor)
    # Record the split sizes as run parameters.
    mlflow.log_param("Train Size", len(train_labels))
    mlflow.log_param("Test Size", len(test_labels))
# One MLflow experiment per metric name; the run id is kept for later use.
# NOTE(review): the run is started without a `with` block, so it stays active
# until end_run() is called somewhere outside this section - confirm.
mlflow.set_experiment(train_data[0].metric_name)
mlflow.start_run()
mlflow_run_id = mlflow.active_run().info.run_id

# keep track of the model name as a mlflow run tag
mlflow.set_tag("model", model_mp.model_name)

# keep track of labels as tags in the mlflow experiment
for label in train_data[0].label_config:
    mlflow.set_tag(label, train_data[0].label_config[label])

# store the metric with labels as a tag so it can be copied into grafana to view the real metric
mlflow.set_tag("metric", metric)

# log parameters before run (values are stringified before logging)
mlflow.log_param("retraining_interval_minutes", str(Configuration.retraining_interval_minutes))
mlflow.log_param("rolling_training_window_size", str(Configuration.rolling_training_window_size))
mlflow.log_param("true_anomaly_threshold", str(Configuration.true_anomaly_threshold))

# initial run with just the train data
model_mp.train(
    prediction_duration=Configuration.retraining_interval_minutes)

# store the predicted dataframe and the true dataframe
predicted_df = model_mp.predicted_df
# The true values are indexed by their "ds" column.
true_df = Metric(test_data_list[0]).metric_values.set_index("ds")

# Label True Anomalies
true_df["anomaly"] = label_true_anomalies(
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info( colorstr("hyperparameters: ") + ", ".join(f"{k}={v}" for k, v in hyp.items())) save_dir, epochs, batch_size, total_batch_size, weights, rank = ( Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank, ) # Directories wdir = save_dir / "weights" wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / "last.pt" best = wdir / "best.pt" results_file = save_dir / "results.txt" # Save run settings with open(save_dir / "hyp.yaml", "w") as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / "opt.yaml", "w") as f: # yaml.dump(vars(opt), f, sort_keys=False) # opt 実行パラメータ yaml.dump(str(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != "cpu" init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict["train"] test_path = data_dict["val"] nc = 1 if opt.single_cls else int(data_dict["nc"]) # number of classes names = (["item"] if opt.single_cls and len(data_dict["names"]) != 1 else data_dict["names"]) # class names assert len(names) == nc, "%g names found for nc=%g dataset in %s" % ( len(names), nc, opt.data, ) # check # Model pretrained = weights.endswith(".pt") if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get("anchors"): ckpt["model"].yaml["anchors"] = round( hyp["anchors"]) # force autoanchor model = Model(opt.cfg or ckpt["model"].yaml, ch=3, nc=nc).to(device) # create exclude = ["anchor"] if opt.cfg or hyp.get("anchors") else [ ] # exclude keys state_dict = ckpt["model"].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect 
model.load_state_dict(state_dict, strict=False) # load logger.info( "Transferred %g/%g items from %s" % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print("freezing %s" % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp["weight_decay"] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp["lr0"], betas=(hyp["momentum"], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp["lr0"], momentum=hyp["momentum"], nesterov=True) optimizer.add_param_group({ "params": pg1, "weight_decay": hyp["weight_decay"] }) # add pg1 with weight_decay optimizer.add_param_group({"params": pg2}) # add pg2 (biases) logger.info("Optimizer groups: %g .bias, %g conv.weight, %g other" % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR if opt.linear_lr: lf = (lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp["lrf"]) + hyp["lrf"]) # linear else: lf = one_cycle(1, hyp["lrf"], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in 
[-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project="YOLOv5" if opt.project == "runs/train" else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get("wandb_id") if "ckpt" in locals() else None, ) loggers = {"wandb": wandb} # loggers dict # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt["optimizer"] is not None: optimizer.load_state_dict(ckpt["optimizer"]) best_fitness = ckpt["best_fitness"] # Results if ckpt.get("training_results") is not None: with open(results_file, "w") as file: file.write(ckpt["training_results"]) # write results.txt # Epochs start_epoch = ckpt["epoch"] + 1 if opt.resume: assert ( start_epoch > 0 ), "%s training to %g epochs is finished, nothing to resume." % ( weights, epochs, ) if epochs < start_epoch: logger.info( "%s has been trained for %g epochs. Fine-tuning for %g additional epochs." % (weights, ckpt["epoch"], epochs)) epochs += ckpt["epoch"] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(model.stride.max()) # grid size (max stride) nl = model.model[ -1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info("Using SyncBatchNorm()") # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader( train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, 
quad=opt.quad, prefix=colorstr("train: "), ) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert ( mlc < nc ), "Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g" % ( mlc, nc, opt.data, nc - 1, ) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, batch_size * 2, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr("val: "), )[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram("classes", c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp["anchor_t"], imgsz=imgsz) # Model parameters hyp["box"] *= 3.0 / nl # scale to layers hyp["cls"] *= nc / 80.0 * 3.0 / nl # scale to classes and layers hyp["obj"] *= (imgsz / 640)**2 * 3.0 / nl # scale to image size and layers model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = ( labels_to_class_weights(dataset.labels, nc).to(device) * nc ) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp["warmup_epochs"] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) compute_loss 
= ComputeLoss(model) # init loss class logger.info(f"Image sizes {imgsz} train, {imgsz_test} test\n" f"Using {dataloader.num_workers} dataloader workers\n" f"Logging results to {save_dir}\n" f"Starting training for {epochs} epochs...") for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = (model.class_weights.cpu().numpy() * (1 - maps)**2 / nc ) # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ("\n" + "%10s" * 8) % ("Epoch", "gpu_mem", "box", "obj", "cls", "total", "targets", "img_size")) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _, ) in ( pbar ): # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = (imgs.to(device, non_blocking=True).float() / 255.0 ) # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other 
lrs rise from 0.0 to lr0 x["lr"] = np.interp( ni, xi, [ hyp["warmup_bias_lr"] if j == 2 else 0.0, x["initial_lr"] * lf(epoch), ], ) if "momentum" in x: x["momentum"] = np.interp( ni, xi, [hyp["warmup_momentum"], hyp["momentum"]]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode="bilinear", align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if rank != -1: loss *= (opt.world_size ) # gradient averaged between devices in DDP mode if opt.quad: loss *= 4.0 # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = "%.3gG" % (torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0) # (GB) s = ("%10s" * 2 + "%10.4g" * 6) % ( "%g/%g" % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1], ) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f"train_batch{ni}.jpg" # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 10 and wandb: wandb.log( { "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob("train*.jpg") if x.exists() ] }, commit=False, ) # end batch ------------------------------------------------------------------------------------------------ # end epoch 
---------------------------------------------------------------------------------------------------- # Scheduler lr = [x["lr"] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr( model, include=[ "yaml", "nc", "hyp", "gr", "names", "stride", "class_weights", ], ) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0, compute_loss=compute_loss, ) # Write with open(results_file, "a") as f: f.write( s + "%10.4g" * 7 % results + "\n") # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system("gsutil cp %s gs://%s/results/results%s.txt" % (results_file, opt.bucket, opt.name)) # Log tags = [ "train/box_loss", "train/obj_loss", "train/cls_loss", # train loss "metrics/precision", "metrics/recall", "metrics/mAP_0.5", "metrics/mAP_0.5:0.95", "val/box_loss", "val/obj_loss", "val/cls_loss", # val loss "x/lr0", "x/lr1", "x/lr2", ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}, step=epoch, commit=tag == tags[-1]) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, "r") as f: # create checkpoint ckpt = { "epoch": epoch, "best_fitness": best_fitness, "training_results": f.read(), "model": ema.ema, "optimizer": None if final_epoch else optimizer.state_dict(), "wandb_id": wandb_run.id 
if wandb else None, } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers final = best if best.exists() else last # final model for f in [last, best]: if f.exists(): strip_optimizer(f) # strip optimizers if opt.bucket: os.system(f"gsutil cp {final} gs://{opt.bucket}/weights") # upload # Plots if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = [ "results.png", "confusion_matrix.png", *[f"{x}_curve.png" for x in ("F1", "PR", "P", "R")], ] wandb.log({ "Results": [ wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists() ] }) if opt.log_artifacts: wandb.log_artifact(artifact_or_path=str(final), type="model", name=save_dir.stem) # Test best.pt logger.info("%g epochs completed in %.3f hours.\n" % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) if opt.data.endswith("coco.yaml") and nc == 80: # if COCO for conf, iou, save_json in ( [0.25, 0.45, False], [0.001, 0.65, True], ): # speed, mAP tests results, _, _ = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, conf_thres=conf, iou_thres=iou, model=attempt_load(final, device).half(), single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, save_json=save_json, plots=False, ) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() # mlflow with mlflow.start_run() as run: # Log args into mlflow for key, value in hyp.items(): mlflow.log_param(key, value) for key, value in vars(opt).items(): mlflow.log_param(key, value) # Log results into mlflow for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): # xがtorch.Tensorだったらfloatに直す if torch.is_tensor(x): x = x.item() # tag名に特殊記号があれば削除する if ":" in tag: tag = re.sub(r":", " ", tag) mlflow.log_metric(tag, x) # Log 
model mlflow.pytorch.log_model(model, "model") return results
logger.exception( "Unable to download training & test CSV, check your internet connection. Error: %s", e) # Useful for multiple runs (only doing one run in this sample notebook) with mlflow.start_run(): m = Prophet() m.fit(df) # Evaluate Metrics df_cv = cross_validation(m, initial="730 days", period="180 days", horizon="365 days") df_p = performance_metrics(df_cv, rolling_window=rolling_window) # Print out metrics print("Prophet model (rolling_window=%f):" % (rolling_window)) print(" CV: \n%s" % df_cv.head()) print(" Perf: \n%s" % df_p.head()) # Log parameter, metrics, and model to MLflow mlflow.log_param("rolling_window", rolling_window) mlflow.log_metric("rmse", df_p.loc[0, "rmse"]) mlflow.pyfunc.log_model("model", conda_env=conda_env, python_model=FbProphetWrapper(m)) print("Logged model with URI: runs:/{run_id}/model".format( run_id=mlflow.active_run().info.run_id))
}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) # learning rate scheduler warmup_steps = args.warmup_epochs * len(train_iterator) print(f"warmup steps: {warmup_steps}") scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) with mlflow.start_run(): # Log our parameters into mlflow for key, value in vars(args).items(): mlflow.log_param(key, value) comment = os.path.basename(args.output_dir) comment = "" if comment == "" else "_" + comment tb_writer = SummaryWriter(comment=comment) for epoch in trange(N_EPOCHS, desc="Epoch"): start_time = time.time() train_loss, train_mpp_acc = train( model, train_iterator, optimizer, sc_criterion,
y_train, scoring='accuracy', return_estimator=True ) score_mean = val_info['test_score'].mean() score_std = val_info['test_score'].std() print(f'{score_mean} accuracy with a standard deviation of {score_std}') # Cell clf = val_info['estimator'][0] # Cell importance_df = pd.DataFrame({'feature':X_train.columns, 'importance': clf.feature_importances_}).sort_values('importance', ascending=False) # Cell importance_df.to_html('../output/data/feature_importance.html', index=False) # Cell if LOG_MLFLOW: with mlflow.start_run(experiment_id=EX_ID): mlflow.log_param('num_features', X_train.shape[1]) mlflow.log_param('n_estimators', clf.get_params()['n_estimators']) mlflow.log_param('max_depth', clf.get_params()['max_depth']) mlflow.log_param('learning_rate', clf.get_params()['learning_rate']) mlflow.log_param('booster', clf.get_params()['booster']) mlflow.log_metric('mean_accuracy', score_mean) mlflow.log_metric('std_accuracy', score_std) mlflow.log_artifact('../output/data/feature_importance.html')
lr = 1e-3

# Build and train one model per (depth, hidden width) combination.
for deep in depths:
    for hidden in hidden_channels:
        # Use the swept hidden width. The value was previously hard-coded to
        # 800, so the sweep had no effect and the logged 'hidden_channel'
        # parameter did not describe the model that was actually trained.
        model = GUNET(in_ch=1, hid_ch=hidden, depth=deep, out_ch=2,
                      pool_ratios=pooling_ratios).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                     weight_decay=0.001)

        # One MLflow run per configuration.
        with start_run():
            log_param('hidden_channel', hidden)
            log_param('pooling_ratios', pooling_ratios)
            log_param('learning_rate', lr)
            log_param('depth', deep)

            for epoch in range(epochs):
                train_loss = []
                val_loss = []
                running_train_los = []
                start_time = time.time()

                for data in sift_train_loader:
                    data = data.to(device)
                    model.train()
                    optimizer.zero_grad()
                    out = loss(model(data), data.y)
                    out.backward()
def main(params: dict, output_dir: str):
    """Train a SAKT transformer on split ``train_0`` and write its artifacts.

    Steps: load the split, build features, split train/validation per user,
    build (or load pickled) datasets, train for the module-level ``epochs``,
    write out-of-fold predictions, log to MLflow, save the weights, and
    optionally fit and pickle the feature factories on the full data.

    Args:
        params: Hyper-parameters. Keys read here: ``max_seq``, ``batch_size``,
            ``embed_dim``, ``lr``, ``num_warmup_steps``. The whole dict is
            logged to MLflow and dumped to ``transformer_param.json``.
        output_dir: Directory that receives ``transformers1.csv``,
            ``transformers.pth``, ``transformer_param.json`` and, when
            ``is_make_feature_factory`` is set, the pickled feature factories.

    Relies on module-level settings ``is_debug``, ``load_pickle``, ``epochs``,
    ``dropout``, ``device`` and ``is_make_feature_factory``.
    """
    import mlflow

    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()
    df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle")
    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)

    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "category"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "qq_table2_mean": {"type": "numeric"},
        "qq_table2_min": {"type": "numeric"}
    }

    # Datasets are rebuilt from scratch unless pickled ones are to be loaded.
    build_datasets = not load_pickle or is_debug

    if build_datasets:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                                             column="content_id",
                                                                             is_debug=is_debug,
                                                                             model_id=model_id,
                                                                             n=300)
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo",
                                                                                           column="user_id")
        feature_factory_dict["user_id"]["QuestionQuestionTableEncoder2"] = \
            QuestionQuestionTableEncoder2(
                model_id=model_id,
                is_debug=is_debug,
                past_n=100,
                min_size=300
            )
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id=model_id,
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly",
                 "prior_question_elapsed_time_bin300", "duration_previous_content_bin300",
                 "prior_question_had_explanation", "rating_diff_content_user_id",
                 "qq_table2_mean", "qq_table2_min"]].replace(-99, -1)
        df["qq_table2_mean"] = df["qq_table2_mean"].fillna(0.65)
        df["qq_table2_min"] = df["qq_table2_min"].fillna(0.6)
        print(df.head(10))

    print("data preprocess")

    # Per-user split: ~1% of users go entirely to validation; for the others
    # the last 5% of their rows are held out.
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            val_idx.extend(w_df[train_num:].index.tolist())

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    if build_datasets:
        df["is_val"] = 0
        # Single .loc call: the chained form `df["is_val"].loc[...] = 1` can
        # write to a temporary and silently leave df unchanged.
        df.loc[val_idx, "is_val"] = 1
        # Explicit copy so the column writes below hit an independent frame.
        w_df = df[df["is_val"] == 0].copy()
        # Chunk each user's history into groups of max_seq rows, counted from
        # the end of the history, and give every chunk its own pseudo user id.
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") -
                         w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)
        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])
        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    if build_datasets:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model080", exist_ok=True)
    if not is_debug and not load_pickle:
        with open("../input/feature_engineering/model080/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open("../input/feature_engineering/model080/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        # NOTE: pickle files are trusted local artifacts written by this script.
        with open("../input/feature_engineering/model080/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open("../input/feature_engineering/model080/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=params["lr"],
                      weight_decay=0.01,
                      )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=params["num_warmup_steps"],
                                                num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val,
                                              optimizer, criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))

    # Out-of-fold predictions on the validation set.
    preds = []
    labels = []
    # Make sure dropout is off for inference regardless of the mode
    # train_epoch leaves the model in.
    model.eval()
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            x = item["x"].to(device).long()
            target_id = item["target_id"].to(device).long()
            part = item["part"].to(device).long()
            label = item["label"].to(device).float()
            elapsed_time = item["elapsed_time"].to(device).long()
            duration_previous_content = item["duration_previous_content"].to(device).long()
            prior_question_had_explanation = item["prior_q"].to(device).long()
            user_answer = item["user_answer"].to(device).long()
            rate_diff = item["rate_diff"].to(device).float()
            qq_table_mean = item["qq_table_mean"].to(device).float()
            qq_table_min = item["qq_table_min"].to(device).float()

            output = model(x, target_id, part, elapsed_time,
                           duration_previous_content, prior_question_had_explanation, user_answer,
                           rate_diff, qq_table_mean, qq_table_min)

            # Only the last position of each sequence is scored.
            preds.extend(torch.sigmoid(output[:, -1]).view(-1).detach().cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).detach().cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)

    if not is_debug:
        # Context manager guarantees the run is closed even if logging raises.
        with mlflow.start_run(experiment_id=10,
                              run_name=os.path.basename(__file__)):
            for key, value in params.items():
                mlflow.log_param(key, value)
            mlflow.log_metric("auc_val", auc_transformer)

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # Refit the feature factories on the full training data and pickle
        # them into output_dir.
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)

        # Loggers are detached before pickling the factories.
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
from __future__ import print_function
import mlflow


def _log_info_file(text):
    """Overwrite info.txt with *text* and attach it to the active run."""
    with open("info.txt", "w") as handle:
        handle.write(text)
    mlflow.log_artifact("info.txt")


experiment_id = 0
name = 'HelloWorld Nested Runs'

# Parent run: tag it and attach an artifact, then open two nested child runs.
with mlflow.start_run(experiment_id=experiment_id, run_name=name) as run:
    print("runId:", run.info.run_uuid, " - name:", name)
    mlflow.set_tag("algo", name)
    _log_info_file(name)

    for j in range(2):
        name2 = "{} {}".format(name, j)
        with mlflow.start_run(run_name=name2, nested=True) as run2:
            print("runId.2:", run2.info.run_uuid, " - name2:", name2)
            mlflow.log_param("alpha", str(j + 0.1))
            mlflow.log_metric("auroch", j + 0.123)
            mlflow.set_tag("algo", name2)
            _log_info_file(name2)
def run(runParams: dict) -> float:
    """Train one shared encoder sequentially on several datasets, with EWC.

    For every dataset a new decoder/classifier is trained together with the
    shared ``coder``. From the second dataset on, an EWC term built from
    samples of the earlier datasets is passed to ``train``. After each
    dataset the earlier test sets are re-encoded with the current coder and
    scored with their original classifiers to track a global accuracy.

    Args:
        runParams: Run configuration. If ``runParams["paramsFile"]`` is not
            None the configuration is loaded from that YAML file and only
            ``device`` / ``subDeviceIdx`` are taken from ``runParams``;
            otherwise ``runParams`` itself is the configuration.

    Returns:
        The test accuracy reported by ``test`` for the last dataset.
    """
    if runParams["paramsFile"] is not None:
        with open(runParams["paramsFile"], "r") as f:
            params = yaml.safe_load(f)
        params["device"] = runParams["device"]
        params["subDeviceIdx"] = runParams["subDeviceIdx"]
    else:
        params = runParams

    # CUDA is used only when requested AND available.
    if params["device"] != "cuda":
        use_cuda = False
    else:
        use_cuda = torch.cuda.is_available()

    if params["subDeviceIdx"] is None:
        subDeviceIdx = 0
    else:
        subDeviceIdx = params["subDeviceIdx"]

    device = torch.device(
        "cuda:{}".format(subDeviceIdx) if use_cuda else "cpu")

    seed = params["training"]["seed"]
    if seed is None:
        seed = np.random.randint(10000)
        logger.debug("Using random seed")
    np.random.seed(seed)
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)

    dataList = _asList(params["training"]["datasets"])
    logger.info(f"Datasets: {dataList}")
    nDatasets = len(dataList)

    # Per-dataset settings are passed through _asList with the dataset count.
    batchSize = params["training"]["batchSize"]
    epochs = _asList(params["training"]["epochs"], nDatasets)
    dataShape = params["training"]["inputShape"]
    dataSize = dataShape[-2:]
    dataChannels = dataShape[1]
    fullLossFactor = _asList(params["training"]["fullLossCoeff"], nDatasets)
    learningRate = _asList(params["training"]["learningRate"], nDatasets)
    ewcCoeff = params["training"]["importance"]
    ewcSampleSize = params["training"]["sampleSize"]

    expName = _genExpName(dataList)
    experiment = mlflow.get_experiment_by_name(expName)
    if experiment is None:
        logger.info("Creating new experiment")
        expID = mlflow.create_experiment(expName)
    else:
        logger.info("Using existing experiment")
        expID = experiment.experiment_id

    with mlflow.start_run(experiment_id=expID):
        modelName = params["name"]
        mlflow.log_param("params", params)
        mlflow.log_param("name", modelName)

        coder = Codec(0, Direction.Forward, params["training"]["inputShape"],
                      params["coder"]).to(device)

        logDir = "../data/logs/" + modelName
        tbWriter = SummaryWriter(logDir)

        previousTestData = []
        previousClassifiers = []

        for datasetIdx, dataset in enumerate(dataList):
            embeddingSize = params["training"]["embeddingSize"]

            # one decoder per dataset
            classifier = Decoder(
                _asList(params["decoder"], nDatasets)[datasetIdx],
                embeddingSize,
                0,
                params["training"]["inputShape"],
                device,
                False,
            ).to(device)

            logger.info(f"\n\t==== {dataset}: TRAINING ====\n")

            train_loader, test_loader = getDatasets(dataset, batchSize,
                                                    dataSize, dataChannels)

            optimParams = chain(coder.parameters(), classifier.parameters())
            optimizer = optim.Adam(optimParams, lr=learningRate[datasetIdx])

            # EWC is built from samples of all earlier datasets; the first
            # dataset has none.
            if datasetIdx > 0:
                ewc = EWC(
                    coder,
                    getTrainingSamples(dataList[:datasetIdx], ewcSampleSize,
                                       dataSize, dataChannels),
                    device,
                )
            else:
                ewc = None

            currentAcc = 0
            currentCorrect = 0
            currentTotalSize = len(test_loader.dataset)

            for epoch in range(1, epochs[datasetIdx] + 1):
                train(
                    coder,
                    classifier,
                    ewc,
                    ewcCoeff,
                    device,
                    train_loader,
                    optimizer,
                    epoch,
                    fullLossFactor[datasetIdx],
                    tbWriter,
                )

                logger.info(f"\n\t==== {dataset}: TEST ({dataset}) ====\n")
                currentAcc, currentCorrect = test(
                    coder,
                    classifier,
                    device,
                    test_loader,
                    len(train_loader),
                    epoch,
                    fullLossFactor[datasetIdx],
                    tbWriter,
                )

            totalCorrect = currentCorrect
            totalDatasetSize = currentTotalSize

            # Re-evaluate every earlier dataset with the current coder and
            # that dataset's own classifier.
            for pIdx, pTestData in enumerate(previousTestData):
                previousDataName = dataList[pIdx]
                logger.info(
                    f"\n\t==== {dataset}: Lifelong TEST ({previousDataName}) "
                    f"====\n")
                testDatasetSize = len(pTestData.dataset)
                totalDatasetSize += testDatasetSize

                logger.info("Re-encoding test data")
                newEncodedTestData = encodeDataset(coder, device, pTestData)

                pClassifier = previousClassifiers[pIdx].to(device)
                previousAcc, previousCorrect = testClassifier(
                    pClassifier,
                    newEncodedTestData,
                    testDatasetSize,
                    epochs[datasetIdx],
                    f"lifelong/{previousDataName}",
                )
                totalCorrect += previousCorrect

                # NOTE(review): kept inside the loop because the log message
                # reads pIdx, which is undefined before the first iteration;
                # the last value logged covers all datasets seen so far.
                # Confirm this matches the intended reporting.
                globalAcc = 100.0 * totalCorrect / totalDatasetSize
                mlflow.log_metric("lifelong/globalAccuracy", globalAcc,
                                  epochs[datasetIdx])
                logger.info(
                    f"Global accuracy at task {pIdx}: {globalAcc:.0f}% "
                    f"({totalCorrect}/{totalDatasetSize})")

            previousTestData.append(test_loader)
            previousClassifiers.append(classifier.cpu())

        # Flush and close the event writer before uploading its directory so
        # the logged artifacts contain every event.
        tbWriter.close()
        mlflow.log_artifacts(logDir, artifact_path="events")

    return currentAcc
import os
from random import random

from mlflow import log_metric, log_param, log_artifacts

if __name__ == "__main__":
    print("Running test.py")

    log_param("param1", 5)

    # Log the same metric three times with a growing offset so the run shows
    # a metric history rather than a single value.
    for offset in range(3):
        log_metric("foo", random() + offset)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("outputs", exist_ok=True)
    with open("outputs/test.txt", "w") as f:
        f.write("hello world!")

    log_artifacts("outputs")
def main():
    """Train a Keras classifier with embedded and direct features.

    Reads the command-line arguments, builds the train/dev/test datasets,
    assembles a feed-forward network (one embedding per embedded column,
    concatenated with the direct features), trains it, logs parameters and
    dev metrics to MLflow, pickles the training history and writes
    ``submit.csv`` with the predicted classes for the test set.

    Output files go to ``experiments/<experiment_name>/<dir_name>``.
    """
    args = read_args()
    print_args(args)

    experiment_name = args.experiment_name
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    hidden_layer_sizes = args.hidden_layer_sizes
    dropout = args.dropout
    epochs = args.epochs

    # Output directory
    dir_name = log_dir_name(args)
    print()
    print(dir_name)
    print()
    output_dir = os.path.join('experiments', experiment_name, dir_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    dataset, dev_dataset, test_dataset = load_dataset(args.dataset_dir)
    nlabels = dataset[TARGET_COL].unique().shape[0]

    columns = ['Gender', 'Color1', 'Breed1']
    one_hot_columns, embedded_columns, numeric_columns = build_columns(dataset, columns)

    # TODO (optional) put these three types of columns in the same dictionary with "column types"
    X_train, y_train = process_features(dataset, one_hot_columns, numeric_columns, embedded_columns)
    direct_features_input_shape = (X_train['direct_features'].shape[1],)
    X_dev, y_dev = process_features(dev_dataset, one_hot_columns, numeric_columns, embedded_columns)

    # Shuffle buffer spans the whole training set, i.e. a full shuffle.
    shuffle_len = X_train['direct_features'].shape[0]
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(shuffle_len).batch(batch_size)
    dev_ds = tf.data.Dataset.from_tensor_slices((X_dev, y_dev)).batch(batch_size)
    test_ds = tf.data.Dataset.from_tensor_slices(process_features(
        test_dataset, one_hot_columns, numeric_columns, embedded_columns, test=True)[0]).batch(batch_size)

    # ---- Build the Keras model ----
    tf.keras.backend.clear_session()

    # One input and one embedding for each embedded column.
    embedding_layers = []
    inputs = []
    for embedded_col, max_value in embedded_columns.items():
        input_layer = layers.Input(shape=(1,), name=embedded_col)
        inputs.append(input_layer)
        # Quarter of the cardinality, but never 0: an embedding needs a
        # positive output width.
        embedding_size = max(1, int(max_value / 4))
        embedding_layers.append(
            tf.squeeze(layers.Embedding(input_dim=max_value, output_dim=embedding_size)(input_layer), axis=-2))
        print('Adding embedding of size {} for layer {}'.format(embedding_size, embedded_col))

    # Direct features that were already calculated.
    direct_features_input = layers.Input(shape=direct_features_input_shape, name='direct_features')
    inputs.append(direct_features_input)

    # Concatenate everything together.
    features = layers.concatenate(embedding_layers + [direct_features_input])

    # Stack the hidden layers, each feeding the next.
    hidden = features
    for hidden_layer_size in hidden_layer_sizes:
        hidden = layers.Dense(hidden_layer_size, activation='relu')(hidden)

    # The output must read from the LAST hidden layer. It was previously
    # wired to the first one, so every additional hidden layer was built but
    # left out of the model.
    output_layer = layers.Dense(nlabels, activation='softmax')(hidden)

    model = models.Model(inputs=inputs, outputs=output_layer)

    # ---- Fit the model ----
    mlflow.set_experiment(experiment_name)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])

    logdir = "logs/scalars/" + dir_name
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

    with mlflow.start_run(nested=True):
        # Log model hyperparameters first.
        mlflow.log_param('hidden_layer_size', hidden_layer_sizes)
        mlflow.log_param('dropout', dropout)  # logged only; no dropout layer is built
        mlflow.log_param('embedded_columns', embedded_columns)
        mlflow.log_param('one_hot_columns', one_hot_columns)
        mlflow.log_param('numeric_columns', numeric_columns)  # Not using these yet
        mlflow.log_param('epochs', epochs)
        mlflow.log_param('batch_size', batch_size)
        mlflow.log_param('learning_rate', learning_rate)

        # Train
        history = model.fit(train_ds, epochs=epochs,
                            validation_data=dev_ds,
                            callbacks=[tensorboard_callback])

        # Keep the raw history for later convergence/overfitting analysis.
        history_path = os.path.join(output_dir, 'history.pickle')
        with open(history_path, 'bw') as f:
            pickle.dump(history.history, f)

        # Evaluate on the dev set.
        loss, accuracy = model.evaluate(dev_ds)
        print("*** Dev loss: {} - accuracy: {}".format(loss, accuracy))
        mlflow.log_metric('loss', loss)
        mlflow.log_metric('accuracy', accuracy)

        # Predict on the test set and convert probabilities to class ids.
        predictions = model.predict(test_ds)
        prediction_classes = np.argmax(predictions, axis=1)

        # Save the results for submission.
        output_csv = os.path.join(output_dir, 'submit.csv')
        submissions = pd.DataFrame(prediction_classes, columns=[TARGET_COL], index=test_dataset.PID)
        submissions.to_csv(output_csv)
# Target column for both splits.
train_y = train[["quality"]]
test_y = test[["quality"]]

# Hyper-parameters come from the first two CLI arguments, defaulting to 0.5.
cli_args = sys.argv[1:]
alpha = float(cli_args[0]) if len(cli_args) >= 1 else 0.5
l1_ratio = float(cli_args[1]) if len(cli_args) >= 2 else 0.5
random_state = random.randint(1, 100)

with mlflow.start_run():
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    for label, value in (("RMSE", rmse), ("MAE", mae), ("R2", r2)):
        print(" %s: %s" % (label, value))

    # Parameters first, then metrics, in the original logging order.
    for key, value in (("alpha", alpha),
                       ("l1_ratio", l1_ratio),
                       ("random_state", random_state)):
        mlflow.log_param(key, value)
    for key, value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
        mlflow.log_metric(key, value)

    mlflow.sklearn.log_model(lr, "model")
def main(args):
    """Train (or load) an MEG regression network, evaluate it and log to MLflow.

    Builds the dataset from the subject's raw files, splits it into
    train/validation/test, trains ``RPS_MNet``, saves loss and prediction
    plots under ``args.figure_dir``, computes MSE/RMSE/MAE/R2 on the test
    split and logs parameters, metrics, figures and the model to MLflow.

    Args:
        args: Parsed command-line arguments. Attributes read here:
            ``data_dir``, ``figure_dir``, ``model_dir``, ``sub``, ``hand``,
            ``batch_size``, ``batch_size_valid``, ``batch_size_test``,
            ``epochs``, ``learning_rate``, ``duration``, ``overlap``,
            ``patience``, ``y_measure``, ``s_n_layer``, ``s_kernel_size``,
            ``t_n_layer``, ``t_kernel_size``, ``max_pooling``,
            ``ff_n_layer``, ``ff_hidden_channels``, ``dropout``,
            ``activation`` and ``experiment``.
    """
    data_dir = args.data_dir
    figure_path = args.figure_dir
    model_path = args.model_dir

    # Input file list; the range starts at run 2 for subject 3 and at run 1
    # for every other subject, and always stops at run 3.
    subj_id = "/sub" + str(args.sub) + "/ball0"
    raw_fnames = [
        "".join([data_dir, subj_id, str(i), "_sss_trans.fif"])
        for i in range(1 if args.sub != 3 else 2, 4)
    ]

    # Set skip_training to False if the model has to be trained, to True if
    # the model has to be loaded.
    skip_training = False

    # Set the torch device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device = {}".format(device))

    # Initialize parameters
    parameters = Params_tunable(
        subject_n=args.sub,
        hand=args.hand,
        batch_size=args.batch_size,
        valid_batch_size=args.batch_size_valid,
        test_batch_size=args.batch_size_test,
        epochs=args.epochs,
        lr=args.learning_rate,
        duration=args.duration,
        overlap=args.overlap,
        patience=args.patience,
        device=device,
        y_measure=args.y_measure,
        s_n_layer=args.s_n_layer,
        # Kernel sizes arrive as a list of string tokens forming a JSON value.
        s_kernel_size=json.loads(" ".join(args.s_kernel_size)),
        t_n_layer=args.t_n_layer,
        t_kernel_size=json.loads(" ".join(args.t_kernel_size)),
        max_pooling=args.max_pooling,
        ff_n_layer=args.ff_n_layer,
        ff_hidden_channels=args.ff_hidden_channels,
        dropout=args.dropout,
        activation=args.activation,
    )

    # Whether the dataset also yields RPS values (must match the network).
    rps = True

    # Generate the custom dataset
    dataset_cls = MEG_Dataset if rps else MEG_Dataset_no_bp
    dataset = dataset_cls(
        raw_fnames,
        parameters.duration,
        parameters.overlap,
        parameters.y_measure,
        normalize_input=True,
    )

    # Split the dataset in train, valid and test sets.
    train_len, valid_len, test_len = len_split(len(dataset))
    print(
        "{} + {} + {} = {}?".format(
            train_len, valid_len, test_len, len(dataset)
        )
    )
    train_dataset, valid_dataset, test_dataset = random_split(
        dataset, [train_len, valid_len, test_len]
    )

    # Initialize the dataloaders
    trainloader = DataLoader(
        train_dataset,
        batch_size=parameters.batch_size,
        shuffle=True,
        num_workers=1,
    )
    validloader = DataLoader(
        valid_dataset,
        batch_size=parameters.valid_batch_size,
        shuffle=True,
        num_workers=1,
    )
    testloader = DataLoader(
        test_dataset,
        batch_size=parameters.test_batch_size,
        shuffle=False,
        num_workers=1,
    )

    # Get the n_times dimension from one batch. Use the builtin next(): the
    # iterator's .next() method is a Python 2 era alias that is not
    # available on current DataLoader iterators.
    with torch.no_grad():
        first_batch = next(iter(trainloader))
        # Batches are (x, y, bp) with RPS integration, (x, y) without.
        x = first_batch[0]
    n_times = x.shape[-1]

    # Initialize network. Alternatives used in earlier experiments: LeNet5,
    # ResNet, SCNN, MNet, RPS_SCNN, RPS_MLP (set mlp = True for RPS_MLP).
    net = RPS_MNet(n_times)
    mlp = False

    print(net)

    # Training loop or model loading
    if not skip_training:
        print("Begin training....")

        # Check the optimizer before running (different from model to model)
        optimizer = Adam(net.parameters(), lr=parameters.lr, weight_decay=5e-4)

        scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5,
                                      patience=15)
        print("scheduler : ", scheduler)

        loss_function = torch.nn.MSELoss()
        start_time = timer.time()

        # Pick the training routine that matches the batch layout.
        if rps:
            train_fn = train_bp_MLP if mlp else train_bp
        else:
            train_fn = train
        net, train_loss, valid_loss = train_fn(
            net,
            trainloader,
            validloader,
            optimizer,
            scheduler,
            loss_function,
            parameters.device,
            parameters.epochs,
            parameters.patience,
            parameters.hand,
            model_path,
        )

        train_time = timer.time() - start_time
        print("Training done in {:.4f}".format(train_time))

        # Visualize the loss as the network trained
        fig = plt.figure(figsize=(10, 4))
        plt.plot(
            range(1, len(train_loss) + 1), train_loss, label="Training Loss"
        )
        plt.plot(
            range(1, len(valid_loss) + 1), valid_loss, label="Validation Loss"
        )

        # Find position of lowest validation loss
        minposs = valid_loss.index(min(valid_loss)) + 1
        plt.axvline(
            minposs,
            linestyle="--",
            color="r",
            label="Early Stopping Checkpoint",
        )

        plt.xlabel("epochs")
        plt.ylabel("loss")
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        # Save BEFORE show(): once show() returns the figure may already be
        # released, and a later savefig would write an empty file.
        fig.savefig(os.path.join(figure_path, "loss_plot.pdf"))
        plt.show()

    if not skip_training:
        # Save the trained model
        save_pytorch_model(net, model_path, "Baselinemodel_SCNN_swap.pth")
    else:
        # Load the model (properly select the model architecture)
        net = RPS_MNet(n_times)
        net = load_pytorch_model(
            net, os.path.join(model_path, "model.pth"), parameters.device
        )

    # Evaluation
    print("Evaluation...")
    net.eval()
    y_pred = []
    y = []

    # Targets and predictions are collected as plain Python floats on the
    # CPU so sklearn and matplotlib can consume them on a CUDA run as well.
    with torch.no_grad():
        for batch in testloader:
            if rps:
                data, labels, bp = batch
                bp = bp.to(parameters.device)
                if mlp:
                    outputs = net(bp)
                else:
                    outputs = net(data.to(parameters.device), bp)
            else:
                data, labels = batch
                outputs = net(data.to(parameters.device))
            y.extend(labels[:, parameters.hand].cpu().reshape(-1).tolist())
            y_pred.extend(outputs.cpu().reshape(-1).tolist())

    print("SCNN_swap...")
    # Calculate Evaluation measures
    mse = mean_squared_error(y, y_pred)
    rmse = mean_squared_error(y, y_pred, squared=False)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    print("mean squared error {}".format(mse))
    print("root mean squared error {}".format(rmse))
    print("mean absolute error {}".format(mae))
    print("r2 score {}".format(r2))

    title = "Sub {}, hand {}, {} prediction".format(
        str(parameters.subject_n),
        "sx" if parameters.hand == 0 else "dx",
        parameters.y_measure,
    )

    # Plot predicted against the true value, focus on 100 timepoints
    fig, ax = plt.subplots(1, 1, figsize=[10, 4])
    times = np.arange(100)
    ax.plot(times, y_pred[0:100], color="b", label="Predicted")
    ax.plot(times, y[0:100], color="r", label="True")
    ax.set_xlabel("Times")
    ax.set_ylabel("{}".format(parameters.y_measure))
    ax.set_title(title)
    plt.legend()
    plt.savefig(os.path.join(figure_path, "Times_prediction_focus.pdf"))
    plt.show()

    # Plot predicted against the true value over the whole test set
    fig, ax = plt.subplots(1, 1, figsize=[10, 4])
    times = np.arange(len(y_pred))
    ax.plot(times, y_pred, color="b", label="Predicted")
    ax.plot(times, y, color="r", label="True")
    ax.set_xlabel("Times")
    ax.set_ylabel("{}".format(parameters.y_measure))
    ax.set_title(title)
    plt.legend()
    plt.savefig(os.path.join(figure_path, "Times_prediction.pdf"))
    plt.show()

    # Scatterplot of predicted against the true value
    fig, ax = plt.subplots(1, 1, figsize=[10, 4])
    ax.scatter(np.array(y), np.array(y_pred), color="b", label="Predicted")
    ax.set_xlabel("True")
    ax.set_ylabel("Predicted")
    plt.savefig(os.path.join(figure_path, "Scatter.pdf"))
    plt.show()

    # Log the model and parameters using mlflow tracker
    with mlflow.start_run(experiment_id=args.experiment) as run:
        for key, value in vars(parameters).items():
            mlflow.log_param(key, value)

        mlflow.log_param("Time", train_time)

        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)

        mlflow.log_artifact(os.path.join(figure_path, "Times_prediction.pdf"))
        mlflow.log_artifact(
            os.path.join(figure_path, "Times_prediction_focus.pdf")
        )
        mlflow.log_artifact(os.path.join(figure_path, "loss_plot.pdf"))
        mlflow.log_artifact(os.path.join(figure_path, "Scatter.pdf"))
        mlflow.pytorch.log_model(net, "models")
def nni_single_shot_neural_architecture_search(hp: HYPERPARAMS_T, model: torch.nn.Module, losses: LOSS_FN_TERMS_T, datasets: Tuple[Dataset, ...], opt: Type[torch.optim.Optimizer],
                                               backend_conf: 'ignite_training.BackendConfig' = None, loss_weights: LOSS_TERMS_WEIGHTS_T = None, metrics: Optional[Dict[str, METRIC_FN_T]] = None,
                                               callbacks_handler: deepcv.utils.EventsHandler = None, final_architecture_path: Union[str, Path] = None) -> Tuple[METRICS_DICT_T, Optional[Path], str]:
    """ Train model with provided NAS trainer in order to find out the best NN architecture by training a superset NN instead of performing multiple trainings/trials for each/many possible architectures.

    Args:
        - hp: Hyperparameter dict, see ``deepcv.meta.ignite_training._check_params`` to see required and default training (hyper)parameters
        - model: Pytorch ``torch.nn.Module`` to train
        - losses: Loss(es) module(s) and/or callables to be used as training criterion(s) (may be a single loss function/module, a sequence of loss functions or a mapping of loss function(s) associated to their respective loss name(s)).
            .. See `loss_weights` argument for more details on multiple loss terms usage and weighting.
            NOTE: This function currently forwards `losses` as-is to the NAS trainer (only moving it to the backend device when it is a ``torch.nn.Module``).
        - datasets: Tuple of pytorch Dataset giving access to trainset, validset and an eventual testset (a validset is required: it is given to the NAS trainer)
        - opt: Optimizer type to be used for gradient descent
        - backend_conf: Backend information defining distributed configuration (available GPUs, whether if CPU or GPU are used, distributed node count, ...), see ``deepcv.meta.ignite_training.BackendConfig`` class for more details.
        - loss_weights: Optional weight/scaling/importance vector or sequence to be applied to each loss terms (defaults to 1. when `None`). This argument should contain as many scalars as there are loss terms/functions in `losses` argument. All weight values are casted to `torch.float32` and L1-norm (sum) should be different from zero due to the mean operator (loss terms ponderated sum is divided by L1-norm of weights).
            NOTE: Each scalar in `loss_weights` weights its respective loss term so that the total loss on which model is trained is the ponderated mean of each loss terms (e.g. when `loss_weights` contains `1.` values for each loss term, then the final loss is the mean of each of those. Another example: if `loss_weights` only contains `len()` values, then the loss on which model is trained is the sum of each terms)
            NOTE: You may provide a mapping of weights instead of a Sequence in case you need to apply weights/factors to their respective term identified by their names (`losses` should also be a mapping in this case)
            NOTE: `loss_weights` is accepted for interface parity but is not read by this function yet.
        - metrics: Additional metrics dictionary (loss is already included in metrics to be evaluated by default); defaults to an empty dict when `None`
        - callbacks_handler: Callbacks Events handler. If not `None`, events listed in `deepcv.meta.ignite_training.TRAINING_EVENTS` will be fired at various steps of training process, allowing to extend `deepcv.meta.ignite_training.train` functionalities ("two-way" callbacks ). Not read by this function yet (see To-Do list).
        - final_architecture_path: File path where the final/optimal fixed architecture found by Single-Shot NAS algorithm have to be exported (JSON file which contains NNI NAS Mutable choices needed to obtain the fixed architecture from the model search space).

    Returns a 3-tuple ``(metrics, final_architecture_path, architecture_choices)``:
        - the first item is currently an ``Ellipsis`` placeholder (see To-Do list: validation metrics are not retrieved yet);
        - the second item is the `pathlib.Path` of the JSON file storing the best NN model architecture found by NNI Single-Shot NAS (mutable layer(s)/input(s) choices made in model search space in order to define best fixed architecture found), or `None` when `final_architecture_path` is `None`; this file is also logged to mlflow if there is an active run;
        - the third item is the value returned by ``trainer.mutator.export()``.
          NOTE(review): the return annotation declares `str` for this item although the exported object (not its JSON string) is returned -- confirm which one callers expect.

    Raises:
        - ValueError: if `datasets` doesn't provide a validset
        - TypeError: if `hp['nas_mutator']` doesn't resolve to a ``BaseMutator`` subclass

    NOTE: Support for SPOS SingleShot NAS is untested and may be partial for now, see [NNI NAS SPOS documentation](https://nni.readthedocs.io/en/latest/NAS/SPOS.html) for more details on Evolution Tuner usage to find best model architecture.

    *To-Do List*
        - # TODO: convert ignite metrics for NNI NAS trainer usage if needed (to Callable[['output', 'target'], Dict[str, float]])
        - # TODO: reuse code from ignite training for output path and log final architecture as mlflow artifact
        - # TODO: Allow resuming an NNI single shot NAS experiment through 'hp['resume_from']' parameter (if possible easily using NNI API?)
        - # TODO: Add support for two-way callbacks using deepcv.utils.EventsHandler in a similar way than ignite_training.train (once ignite_training fully support it)
    """
    # Function-scope import (as the original ``BackendConfig`` import was); the module is also needed below for ``add_training_output_dir``
    from . import ignite_training

    if backend_conf is None:
        backend_conf = ignite_training.BackendConfig()
    if metrics is None:
        metrics = {}  # Fresh dict per call instead of a shared mutable default argument

    experiment_name, run_id = get_nni_or_mlflow_experiment_and_trial()
    run_info_msg = f'(Experiment: "{experiment_name}", run_id: "{run_id}")'
    logging.info(f'Starting Single-Shot Neural Architecture Search (NNI NAS API) training over NN architecture search space {run_info_msg}.')

    # ``...`` marks required hyperparameters; 'scheduler' is optional and is read below, so it needs a default
    TRAINING_HP_DEFAULTS = {'optimizer_opts': ..., 'epochs': ..., 'batch_size': None, 'nni_single_shot_nas_algorithm': ..., 'output_path': Path.cwd() / 'data' / '04_training',
                            'log_output_dir_to_mlflow': True, 'log_progress_every_iters': 100, 'seed': None, 'resume_from': '', 'deterministic_cudnn': False, 'scheduler': None,
                            'nas_mutator': None, 'nas_mutator_kwarg': dict(), 'nas_trainer_kwargs': dict()}
    hp, _ = hyperparams.to_hyperparameters(hp, TRAINING_HP_DEFAULTS, raise_if_missing=True)

    # In distributed setup, we need to have different seed for each workers.
    # 'seed' defaults to `None`, which can't be added to the rank: forward `None` unchanged in that case.
    # NOTE(review): assumes ``deepcv.utils.setup_cudnn`` accepts ``seed=None`` -- confirm.
    seed = None if hp['seed'] is None else backend_conf.rank + hp['seed']
    deepcv.utils.setup_cudnn(deterministic=hp['deterministic_cudnn'], seed=seed)

    model = model.to(backend_conf.device, non_blocking=True)
    # The criterion comes from the ``losses`` argument (there is no ``loss`` name in scope before this line)
    loss = losses.to(backend_conf.device) if isinstance(losses, torch.nn.Module) else losses
    num_workers = max(1, (backend_conf.ncpu - 1) // (backend_conf.ngpus_current_node if backend_conf.ngpus_current_node > 0 and backend_conf.distributed else 1))
    optimizer = opt(model.parameters(), **hp['optimizer_opts'])

    trainset, *validset_testset = datasets
    if not validset_testset:
        raise ValueError('Error: `datasets` must contain at least a trainset and a validset in order to perform NNI Single-Shot NAS training.')

    output_path = ignite_training.add_training_output_dir(hp['output_path'], backend_conf, prefix='single_shot_nas_')
    # TODO: use this output_path in trainer and export final architecture to this directory too
    # TODO: use ignite_training function to setup distributed training?

    # Creates HP scheduler from hp if respective hp arguments have been provided by user
    scheduler = None
    if hp['scheduler'] is not None:
        args_to_eval = hp['scheduler']['eval_args'] if 'eval_args' in hp['scheduler'] else {}
        # NOTE: kwargs listed in 'eval_args' are passed through `eval`; they must come from a trusted configuration only
        scheduler_kwargs = {n: eval(v, {'hp': hp, 'iterations': len(trainset)}) if n in args_to_eval else v
                            for n, v in hp['scheduler']['kwargs'].items()}
        scheduler = nni.nas.pytorch.callbacks.LRSchedulerCallback(scheduler=hp['scheduler']['type'](optimizer=optimizer, **scheduler_kwargs))

    # Only register the LR scheduler callback when there is one (never hand `None` to the trainer as a callback)
    nas_trainer_callbacks = [] if scheduler is None else [scheduler, ]  # TODO: ... add user provided callbacks

    if not is_nni_run_standalone() and not is_nni_gen_search_space_mode():
        class _ReportToNNICallback(nni.nas.pytorch.callbacks.Callback):
            """ Reports intermediate/final results to NNI at the end of each epoch (metrics retrieval is still a TODO) """

            def __init__(self, epochs=hp['epochs']):
                super().__init__()
                self.epochs = epochs

            def on_epoch_end(self, epoch):
                # TODO: find a way to retreive metrics or evaluate model on my own (meters = AverageMeterGroup() ...), see https://nni.readthedocs.io/en/latest/_modules/nni/nas/pytorch/enas/trainer.html
                meters = ...
                # NOTE(review): if the trainer passes 0-based epoch indices, this condition never holds for the last epoch -- confirm against NNI trainer
                if epoch >= self.epochs:
                    nni.report_final_result(meters)
                else:
                    nni.report_intermediate_result(meters)

        # The callback expects an epoch count, not the whole hyperparameter mapping
        nas_trainer_callbacks.append(_ReportToNNICallback(epochs=hp['epochs']))

    # Work on a copy so that entries added below don't leak into the hyperparameters object
    nas_trainer_kwargs = dict(hp['nas_trainer_kwargs'])
    nas_mutator = hp['nas_mutator']
    if nas_mutator is not None:
        if isinstance(nas_mutator, str):
            nas_mutator = deepcv.utils.get_by_identifier(nas_mutator)
        if not issubclass(nas_mutator, nni.nas.pytorch.base_mutator.BaseMutator):
            raise TypeError('Error: NNI SingleShot NAS Mutator argument "nas_mutator" must either be a "nni.nas.pytorch.mutables.Mutator" Type or a string identifier which resolves to a "nni.nas.pytorch.mutables.Mutator" Type.')
        nas_trainer_kwargs['mutator'] = nas_mutator(**hp['nas_mutator_kwarg'])  # Instanciate user-provided mutator
    if hp['batch_size'] is not None:
        nas_trainer_kwargs['batch_size'] = hp['batch_size']

    train_type = NNI_SINGLE_SHOT_NAS_ALGORITHMS[hp['nni_single_shot_nas_algorithm']]
    trainer = train_type(model=model, loss=loss, metrics=metrics, optimizer=optimizer, num_epochs=hp['epochs'], trainset=trainset, validset=validset_testset[0],
                         num_workers=num_workers, device=backend_conf.device, log_frequency=hp['log_progress_every_iters'], callbacks=nas_trainer_callbacks, **nas_trainer_kwargs)

    # Train model with provided NAS trainer in order to find out the best NN architecture by training a superset NN instead of performing multiple trainings/trials for each/many possible architectures
    trainer.train()
    logging.info(f'Single-shot NAS training done. Validating model architecture... {run_info_msg}')
    trainer.validate()

    logging.info(f'Saving obtained NN architecture from NNI Single-Shot NAS algorithm as a JSON file and logging it to mlfow if possible... {run_info_msg}')

    # Print resulting architecture as a JSON string and save it to a JSON file if `final_architecture_path` isn't `None` (and is valid)
    architecture_choices = trainer.mutator.export()
    json_choices = deepcv.utils.replace_newlines(json.dumps(architecture_choices, indent=2, sort_keys=True, cls=nni.nas.pytorch.trainer.TorchTensorEncoder))
    choices_msg = (f'NAS Mutable choices:{NL}'
                   f'``` json{NL}{json_choices}{NL}```')

    if final_architecture_path is None:
        # Only claim that nothing is saved when that is actually the case
        logging.info(f'Final/best NN architeture obtained from NNI Single-Shot NAS wont be saved to a JSON file as `final_architecture_path` is `None`. {run_info_msg}{NL}{choices_msg}')
    else:
        logging.info(f'Saving final/best NN architeture obtained from NNI Single-Shot NAS (and may log it to MLFLow artifacts). {run_info_msg}{NL}{choices_msg}')
        final_architecture_path = Path(final_architecture_path)
        with final_architecture_path.open(mode='w', newline=NL) as json_file:  # 'w' mode will replace any existing file
            json_file.write(json_choices)  # export the final architecture to a JSON file

    if mlflow.active_run() is not None:
        if final_architecture_path is not None:
            mlflow.log_artifact(str(final_architecture_path))
            mlflow.set_tag('final_single_shot_nas_architecture_path', str(final_architecture_path))
        mlflow.log_param('final_single_shot_nas_architecture', json_choices)

    logging.info(f'Single-Shot NAS trainning procedure completed. {run_info_msg}')
    return (..., final_architecture_path, architecture_choices)  # TODO: return 'meters' metrics resulting from best evaluation on validset
# Script fragment: fit an ElasticNet model, score it on the validation split
# and log parameters, metrics, data, code and the model itself to MLflow.

# Instantiating model with model parameters
model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

# Fitting training data to the model
model.fit(X_train, y_train)

# Running prediction on validation dataset
preds = model.predict(X_val)

# Getting metrics on the validation dataset.
# scikit-learn metrics take (y_true, y_pred) in that order: R^2 is not
# symmetric, so swapping the arguments yields a different (wrong) score.
# The value logged as 'rmse' is the square *root* of the mean squared error.
rmse = mean_squared_error(y_val, preds) ** 0.5
abs_error = mean_absolute_error(y_val, preds)
r2 = r2_score(y_val, preds)

# Logging params and metrics to MLFlow
mlflow.log_param('alpha', alpha)
mlflow.log_param('l1_ratio', l1_ratio)
mlflow.log_metric('rmse', rmse)
mlflow.log_metric('abs_error', abs_error)
mlflow.log_metric('r2', r2)

# Logging training data
mlflow.log_artifact(local_path='../data/wine/train.csv')

# Logging training code
mlflow.log_artifact(local_path='./mlflow-wine.py')

# Logging model to MLFlow
mlflow.sklearn.log_model(sk_model=model,
                         artifact_path='wine-pyfile-model',
                         registered_model_name='wine-pyfile-model')
from tensorflow.keras.models import Model
import os
import mlflow
from random import random, randint
from mlflow import log_metric, log_param, log_artifacts  #pyfunc
import mlflow.tensorflow

print(tf.__version__)

if __name__ == "__main__":
    # Point the MLflow client at the tracking server
    mlflow.set_tracking_uri("http://0.0.0.0:7777")  #mlflow-server:7777
    # mlflow.set_experiment("/my-experiment")

    # One random parameter (key-value pair)
    log_param("param1", randint(0, 100))

    # The same metric key is logged three times: a random value shifted by
    # 0, 1 and 2 (metrics can be updated throughout the run)
    for offset in range(3):
        log_metric("foo", random() + offset)

    # Load in the data
    fashion_mnist = tf.keras.datasets.fashion_mnist
    (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    # Divide the pixel values of both splits by 255
    x_train = x_train / 255.0
    x_test = x_test / 255.0

    # Append a trailing axis so the arrays can be used by the 2D
    # convolutional keras layer
    x_train = np.expand_dims(x_train, axis=-1)
    x_test = np.expand_dims(x_test, axis=-1)
def log_param(key, val):
    """Forward the parameter name ``key`` and its value ``val`` to ``mlflow.log_param``."""
    mlflow.log_param(key, val)
import mlflow
import os
import time
from mlflow import log_metric, log_param, log_artifact

if __name__ == "__main__":
    mlflow.set_experiment("First")

    with mlflow.start_run():
        # Log a parameter (key-value pair)
        log_param("param1", 5)

        # Metric name -> multiplier applied to the step index; entries are
        # logged in this order at every step.
        metric_factors = (
            ("foo1", 1),
            ("foo2", 2),
            ("foo3", 3),
            ("foo4", 3),
            ("foo5", 3),
            ("foo6", 3),
            ("foo7", 3),
            ("foo8", 3),
            ("foo9", 3),
            ("foo10", 3),
            ("foo11", 3),
            ("foo12", 3),
            ("foo13", 3),
            ("foo14", 3),
            ("foo15", 3),
            ("foo16", 3),
        )

        # Log every metric once per step; metrics can be updated throughout
        # the run
        for i in range(200):
            time.sleep(0.1)
            for metric_name, factor in metric_factors:
                log_metric(metric_name, factor * i)
get_ipython().system(' pip install --quiet mlflow==1.12.1 neptune-mlflow==0.2.5 neptune-client==0.4.132') get_ipython().system(' pip install --quiet --upgrade mlflow neptune-mlflow neptune-client') ## Create some MLflow runs import os from random import random, randint import mlflow # start a run mlflow.start_run() # Log a parameter (key-value pair) mlflow.log_param("param1", randint(0, 100)) # Log a metric; metrics can be updated throughout the run mlflow.log_metric("foo", random()) mlflow.log_metric("foo", random()+1) mlflow.log_metric("foo", random()+2) mlflow.log_metric("foo", random()+3) mlflow.log_metric("bar", random()) mlflow.log_metric("bar", random()+1) mlflow.log_metric("bar", random()+2) mlflow.log_metric("bar", random()+3) # Log an artifact (output file) os.makedirs("outputs", exist_ok=True) with open("outputs/test.txt", "w") as f: