"wasbs://feature_engineereddata@stmyappdataopstrstage.blob.core.windows.net/" ) dbutils.widgets.text("feature_engineered_blob_config", "fs.azure.account.key.MYACCOUNT.blob.core.windows.net") dbutils.widgets.text("feature_engineered_blob_secretname", "MYCONTAINER@MYACCOUNT") # COMMAND ---------- # Connect to Azure ML dbutils.library.installPyPI("azureml-sdk", version="1.0.85", extras="databricks") from azureml.core import Run # In an Azure ML run, settings get imported from passed --AZUREML_* parameters run = Run.get_context(allow_offline=True) # COMMAND ---------- # Set up storage credentials spark.conf.set( dbutils.widgets.get("training_blob_config"), dbutils.secrets.get(scope=dbutils.widgets.get("secretscope"), key=dbutils.widgets.get("training_blob_secretname")), ) spark.conf.set( dbutils.widgets.get("feature_engineered_blob_config"), dbutils.secrets.get( scope=dbutils.widgets.get("secretscope"),
import os
import argparse

from azureml.core import Run

parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str,
                    help='Name under which model will be registered')
parser.add_argument('--model_path', type=str, help='Model directory')
args, _ = parser.parse_known_args()
print(f'Arguments: {args}')

model_name = args.model_name
model_path = args.model_path

# current run is the registration step
current_run = Run.get_context()

# parent run is the overall pipeline
parent_run = current_run.parent
print(f'Parent run id: {parent_run.id}')

# Upload models to pipeline artifacts and register a model from them
parent_run.upload_folder(name='models', path=model_path)
parent_run.register_model(model_path='models', model_name=model_name)
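# --- Added sketch (not from the original script; SDK v1 pipeline wiring) ---
# A hedged example of how a registration step like the one above might be
# wired into a pipeline. "register.py", "cpu-cluster", and the PipelineData
# produced by an assumed upstream training step are all placeholders.
from azureml.core import Workspace
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

ws = Workspace.from_config()
# Assumed to be populated by an upstream training step's outputs:
model_dir = PipelineData("model_dir", datastore=ws.get_default_datastore())

register_step = PythonScriptStep(
    name="register_model",
    script_name="register.py",     # the script above, assumed filename
    source_directory=".",
    arguments=["--model_name", "my_model", "--model_path", model_dir],
    inputs=[model_dir],
    compute_target="cpu-cluster",  # assumed compute target name
)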
def init():
    # Set arguments. These should be all of the hyperparameters you will tune.
    global args
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument('--eta', type=float, default=0.1,
                        help='Learning rate (XGBoost alias of learning_rate)')
    parser.add_argument('--learning_rate', type=float, default=0.1,
                        help='Learning rate')
    parser.add_argument('--scale_pos_weight', type=float, default=0.6,
                        help='Helps with unbalanced classes. '
                             'Should be sum(negative) / sum(positive)')
    parser.add_argument('--booster', type=str, default='gbtree',
                        help='The type of boosting algorithm')
    parser.add_argument('--min_child_weight', type=float, default=1,
                        help='Controls overfitting')
    parser.add_argument('--max_depth', type=int, default=6,
                        help='Controls overfitting')
    parser.add_argument('--gamma', type=float, default=0,
                        help='Makes the algorithm conservative')
    parser.add_argument('--subsample', type=float, default=1,
                        help='Controls overfitting')
    parser.add_argument('--colsample_bytree', type=float, default=1,
                        help='Defines sampling')
    parser.add_argument('--reg_lambda', type=float, default=1,
                        help='Controls overfitting')
    parser.add_argument('--alpha', type=float, default=0,
                        help='Reduces dimensionality')
    # Alternative objectives: 'binary:logistic', 'reg:logistic', 'multi:softmax'
    parser.add_argument('--objective', type=str, default='multi:softmax',
                        help='Defines the training objective metric')

    # Other parameters
    parser.add_argument('--train_dataset_name', type=str,
                        help='Name of training dataset')
    parser.add_argument('--val_dataset_name', type=str,
                        help='Name of validation dataset')
    parser.add_argument('--target_column_name', type=str,
                        help='Name of variable to score')
    parser.add_argument('--k_folds', type=int, default=10,
                        help='Number of folds to split your data into for cross validation')
    parser.add_argument('--shuffle_split_size', type=float,
                        help='Percentage of data to hold out for testing during cross validation')
    parser.add_argument('--confidence_level', type=float, default=0.95,
                        help='Level of confidence to set for your confidence interval')
    args = parser.parse_args()
    print(args)

    # Set the Run context for logging
    global run
    run = Run.get_context()

    # Log your hyperparameters
    run.log('eta', float(args.eta))
    run.log('learning_rate', float(args.learning_rate))
    run.log('scale_pos_weight', float(args.scale_pos_weight))
    run.log('booster', str(args.booster))
    run.log('min_child_weight', float(args.min_child_weight))
    run.log('max_depth', float(args.max_depth))
    run.log('gamma', float(args.gamma))
    run.log('subsample', float(args.subsample))
    run.log('colsample_bytree', float(args.colsample_bytree))
    run.log('reg_lambda', float(args.reg_lambda))
    run.log('alpha', float(args.alpha))
    run.log('objective', str(args.objective))
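# --- Added sketch (assumption: sweeping the script above with HyperDrive) ---
# Because every hyperparameter above is exposed as a script argument and
# logged, the script can be tuned with a HyperDrive sweep (SDK v1). The
# ranges below and `script_run_config` are illustrative placeholders.
from azureml.train.hyperdrive import (HyperDriveConfig, PrimaryMetricGoal,
                                      RandomParameterSampling, choice, uniform)

param_sampling = RandomParameterSampling({
    "--eta": uniform(0.01, 0.3),
    "--max_depth": choice(3, 5, 7, 9),
    "--subsample": uniform(0.5, 1.0),
    "--colsample_bytree": uniform(0.5, 1.0),
})

hd_config = HyperDriveConfig(
    run_config=script_run_config,            # assumed ScriptRunConfig for this script
    hyperparameter_sampling=param_sampling,
    primary_metric_name="accuracy",          # assumes the script logs 'accuracy'
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
)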
import argparse
import os

import numpy as np
from azureml.core import Run

run = Run.get_context()  # get hold of the current run

# Let the user feed in four parameters: the location of the data files
# (container + folder from datastore), the regularization rate of the
# logistic regression algorithm, and the model name.
parser = argparse.ArgumentParser()
parser.add_argument('--reg', type=float, help='regularization rate')
parser.add_argument('--datapreparation_output', type=str, help='datapreparation_output')
parser.add_argument('--datatrain_output', type=str, help='datatrain_output')
parser.add_argument('--is_directory', type=bool, help='is_directory')
args = parser.parse_args()

reg = args.reg
print('Regularization Rate:', reg)
run.log('Regularization Rate', reg)

datapreparation_output = args.datapreparation_output
print('datapreparation_output:', datapreparation_output)
run.log('datapreparation_output', datapreparation_output)

datatrain_output = args.datatrain_output
print('datatrain_output:', datatrain_output)
run.log('datatrain_output', datatrain_output)

is_directory = args.is_directory
print('is_directory:', is_directory)
run.log('is_directory', is_directory)
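# --- Added note (caveat on the snippet above) ---
# argparse's type=bool converts any non-empty string -- including "False" --
# to True, so `--is_directory False` would still parse as True. A common
# workaround is an explicit string-to-bool converter:
def str2bool(value: str) -> bool:
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")

# usage: parser.add_argument('--is_directory', type=str2bool, default=False)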
def train_model(df, target):
    # Collect the categorical feature columns (dummies are created by the
    # OneHotEncoder below)
    categorical = []
    for col, value in df.items():
        if value.dtype == 'object':
            categorical.append(col)
    # Store the numerical columns in a list `numerical`
    numerical = df.columns.difference(categorical)

    numeric_transformations = [
        ([f], Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                              ('scaler', StandardScaler())]))
        for f in numerical
    ]
    categorical_transformations = [
        ([f], OneHotEncoder(handle_unknown='ignore', sparse=False))
        for f in categorical
    ]
    transformations = numeric_transformations + categorical_transformations

    # Append classifier to preprocessing pipeline
    clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),
                          ('classifier', LogisticRegression(solver='lbfgs'))])

    # Split data into train and test
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.35, random_state=0, stratify=target)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    accu = accuracy_score(y_test, y_pred)

    # Save the model in the outputs folder so it automatically gets uploaded
    model_file_name = 'classifier.pkl'
    os.makedirs('./outputs', exist_ok=True)
    joblib.dump(value=clf, filename=os.path.join('./outputs/', model_file_name))

    run = Run.get_context()
    run.log("accuracy", accu)
    # We upload the model into the experiment artifact store, but do not
    # register it as a model until unit tests pass successfully in the next
    # ML step.
    run.upload_file(model_file_name, os.path.join('./outputs/', model_file_name))

    # Interpretability steps
    client = ExplanationClient.from_run(run)
    # Using SHAP TabularExplainer
    explainer = TabularExplainer(clf.steps[-1][1],
                                 initialization_examples=x_train,
                                 features=df.columns,
                                 classes=["Not leaving", "leaving"],
                                 transformations=transformations)

    # Explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(x_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    # Upload global model explanation data for storage or visualization in
    # the web UX. The explanation can then be downloaded on any compute;
    # multiple explanations can be uploaded.
    client.upload_model_explanation(global_explanation,
                                    comment='global explanation: all features')
def model_train(df):
    run = Run.get_context()

    df.drop("step", axis=1, inplace=True)
    df.drop("isFlaggedFraud", axis=1, inplace=True)

    # Dropping for demo reasons
    df.drop("nameOrig", axis=1, inplace=True)
    df.drop("nameDest", axis=1, inplace=True)

    y_raw = df['isFraud']
    X_raw = df.drop('isFraud', axis=1)

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
        ('onehotencoder', OneHotEncoder(categories='auto', sparse=False))])

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)
        ], remainder="drop")

    # Encode labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw, encoded_y,
                                                        test_size=0.20,
                                                        stratify=encoded_y,
                                                        random_state=42)

    # Create sklearn pipeline
    clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
                          ('classifier', LogisticRegression(solver="saga",
                                                            max_iter=250))])

    # Train the model
    clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Testing accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    # Explain model
    explainer = TabularExplainer(clf.steps[-1][1],
                                 initialization_examples=X_train,
                                 features=X_raw.columns,
                                 classes=["NotFraud", "Fraud"],
                                 transformations=feature_engineering_pipeline)

    # Explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(X_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='Global Explanation: All Features')

    return clf
def main():
    run = Run.get_context()
    if run.id.startswith('OfflineRun'):
        from dotenv import load_dotenv
        sys.path.append(os.path.abspath("./code/util"))  # NOQA: E402
        from model_helper import get_model_by_build_id

        # For local development, set values in this section
        load_dotenv()
        workspace_name = os.environ.get("WORKSPACE_NAME")
        experiment_name = os.environ.get("EXPERIMENT_NAME")
        resource_group = os.environ.get("RESOURCE_GROUP")
        subscription_id = os.environ.get("SUBSCRIPTION_ID")
        tenant_id = os.environ.get("TENANT_ID")
        model_name = os.environ.get("MODEL_NAME")
        app_id = os.environ.get('SP_APP_ID')
        app_secret = os.environ.get('SP_APP_SECRET')
        build_id = os.environ.get('BUILD_BUILDID')
        service_principal = ServicePrincipalAuthentication(
            tenant_id=tenant_id,
            service_principal_id=app_id,
            service_principal_password=app_secret)

        aml_workspace = Workspace.get(name=workspace_name,
                                      subscription_id=subscription_id,
                                      resource_group=resource_group,
                                      auth=service_principal)
        ws = aml_workspace
        exp = Experiment(ws, experiment_name)
        run_id = "bd184a18-2ac8-4951-8e78-e290bef3b012"
    else:
        sys.path.append(os.path.abspath("./util"))  # NOQA: E402
        from model_helper import get_model_by_build_id
        ws = run.experiment.workspace
        exp = run.experiment
        run_id = 'amlcompute'

    parser = argparse.ArgumentParser("register")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The Build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--run_id",
        type=str,
        help="Training run ID",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )
    parser.add_argument(
        "--validate",
        type=str,
        help="Set to true to only validate if model is registered for run",
        default=False,
    )
    args = parser.parse_args()

    if args.build_id is not None:
        build_id = args.build_id
    if args.run_id is not None:
        run_id = args.run_id
    if run_id == 'amlcompute':
        run_id = run.parent.id
    if args.validate is not None:
        validate = args.validate
    model_name = args.model_name

    if validate:
        try:
            get_model_by_build_id(model_name, build_id, exp.workspace)
            print("Model was registered for this build.")
        except Exception as e:
            print(e)
            print("Model was not registered for this run.")
            sys.exit(1)
    else:
        if build_id is None:
            register_aml_model(model_name, exp, run_id)
        else:
            run.tag("BuildId", value=build_id)
            register_aml_model(model_name, exp, run_id, build_id)
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license.

from azureml.core import Run

submitted_run = Run.get_context()
submitted_run.log(name="message", value="Hello from run!")
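# --- Added sketch (assumption: how the hello-world script might be submitted) ---
# A minimal submission of the script above with ScriptRunConfig (SDK v1);
# the config.json, script filename, and compute target name are placeholders.
from azureml.core import Experiment, ScriptRunConfig, Workspace

ws = Workspace.from_config()  # assumes a config.json is available
src = ScriptRunConfig(
    source_directory=".",
    script="hello.py",             # assumed filename for the snippet above
    compute_target="cpu-cluster",  # assumed compute target name
)
run = Experiment(ws, "hello-experiment").submit(src)
run.wait_for_completion(show_output=True)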
def main():
    run = Run.get_context()
    if run.id.startswith('OfflineRun'):
        from dotenv import load_dotenv
        sys.path.append(os.path.abspath("./code/util"))  # NOQA: E402
        from model_helper import get_model_by_tag

        # For local development, set values in this section
        load_dotenv()
        workspace_name = os.environ.get("WORKSPACE_NAME")
        experiment_name = os.environ.get("EXPERIMENT_NAME")
        resource_group = os.environ.get("RESOURCE_GROUP")
        subscription_id = os.environ.get("SUBSCRIPTION_ID")
        build_id = os.environ.get('BUILD_BUILDID')
        aml_workspace = Workspace.get(
            name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group
        )
        ws = aml_workspace
        exp = Experiment(ws, experiment_name)
    else:
        sys.path.append(os.path.abspath("./util"))  # NOQA: E402
        from model_helper import get_model_by_tag
        ws = run.experiment.workspace
        exp = run.experiment

    parser = argparse.ArgumentParser("register")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The Build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model"
    )
    parser.add_argument(
        "--output_model_version_file",
        type=str,
        help="Name of a file to write model version to"
    )
    args = parser.parse_args()

    if args.build_id is not None:
        build_id = args.build_id
    model_name = args.model_name

    try:
        tag_name = 'BuildId'
        model = get_model_by_tag(
            model_name, tag_name, build_id, exp.workspace)
        if model is not None:
            print("Model was registered for this build.")
        if model is None:
            print("Model was not registered for this run.")
            sys.exit(1)
    except Exception as e:
        print(e)
        print("Model was not registered for this run.")
        sys.exit(1)

    # Save the model version for other AzDO jobs after the script completes
    if args.output_model_version_file is not None:
        with open(args.output_model_version_file, "w") as out_file:
            out_file.write(str(model.version))
def init():
    global ws
    current_run = Run.get_context()
    ws = current_run.experiment.workspace
    print("Init complete")
parser.add_argument("--date_column", type=str, help="date_column") parser.add_argument("--hour_column", type=str, help="hour_column") parser.add_argument("--datetime_column_name", type=str, help="datetime_column_name") parser.add_argument("--pivot_columns", type=str, help="pivot_columns") parser.add_argument("--value_column", type=str, help="value_column") parser.add_argument("--output", type=str, help="output") args = parser.parse_args() print("Date Column: %s" % args.date_column) print("Hour Column: %s" % args.hour_column) print("Datetime Column Name: %s" % args.datetime_column_name) print("Pivot Columns: %s" % args.pivot_columns) print("Value Column: %s" % args.value_column) print("Output: %s" % args.output) # Retrieve Input Dataset input_ds = Run.get_context().input_datasets["time_series"] # Read dataset as a DataFrame input_df = input_ds.to_pandas_dataframe() # NOTE: Ability to develop/work on samples from the original dataset (0.01 = 1% of full dataset) # input_df = input_dataset.take_sample(0.01).to_pandas_dataframe() # Generate timestamp column 'DATETIME' FROM date AND hour columns input_df[args.datetime_column_name] = input_df.apply(lambda x: gen_date(x[args.date_column], x[args.hour_column]), axis=1) # Drop date AND hour columns input_df = input_df.drop(columns=[args.date_column,args.hour_column]) # Pivot Data if args.pivot_columns: # pivot and set index to datetime output_df = pd.pivot_table(input_df, values=args.value_column, index=args.datetime_column_name, columns=args.pivot_columns, aggfunc=np.max) else:
import argparse
import os

from azureml.core import Run
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# The dataset is specified at the pipeline definition level.
RANDOM_STATE = 42

parser = argparse.ArgumentParser()
parser.add_argument('--X_train_dir', dest='X_train_dir', required=True)
parser.add_argument('--X_test_dir', dest='X_test_dir', required=True)
parser.add_argument('--y_train_dir', dest='y_train_dir', required=True)
parser.add_argument('--y_test_dir', dest='y_test_dir', required=True)
args = parser.parse_args()

ds = Run.get_context().input_datasets['iris_baseline']

# Now the actual data prep (trivial)
df = ds.to_pandas_dataframe()
le = LabelEncoder()
le.fit(df['species'])
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:4], le.transform(df['species']),
    test_size=0.2, random_state=RANDOM_STATE)

# Write outputs as `OutputFileDatasetConfig`
x_train_fname = os.path.join(args.X_train_dir, "data.txt")
x_test_fname = os.path.join(args.X_test_dir, "data.txt")
y_train_fname = os.path.join(args.y_train_dir, "data.txt")
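# --- Added sketch (the snippet above ends before the splits are written) ---
# A hedged completion under the assumption that plain-text arrays are
# acceptable downstream; np.savetxt and the y_test path mirror the pattern
# already established above.
import numpy as np

y_test_fname = os.path.join(args.y_test_dir, "data.txt")

# Each *_dir is an OutputFileDatasetConfig mount; create it before writing.
for path, data in [(x_train_fname, X_train), (x_test_fname, X_test),
                   (y_train_fname, y_train), (y_test_fname, y_test)]:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    np.savetxt(path, data)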
def handle_arguments(arg_parser) -> argparse.Namespace:
    # data args
    arg_parser.add_argument("--data-folder", type=str)
    args = arg_parser.parse_args()
    return args


def handle_configurations() -> Tuple[dict, dict, dict]:
    conf = load_training_conf("train_conf.yml")
    conf_train, conf_data = conf["training"], conf["data"]
    azure_conf = load_azure_conf("azure_conf.yml")
    return conf_train, conf_data, azure_conf


if __name__ == "__main__":
    azure_run_context = Run.get_context()
    args = handle_arguments(argparse.ArgumentParser())
    conf_train, conf_data, azure_conf = handle_configurations()

    csv_dataset_name = azure_conf["LOCAL_DATASET_PATH"].split(os.sep)[-1]
    (x_train, x_test, y_train, y_test), tokenizer = training_data(
        tickets_data_path=os.path.join(args.data_folder, csv_dataset_name),
        text_column=conf_data["text_column"],
        label_column=conf_data["label_column"],
        test_size=conf_train.get("test_set_size", 0.25),
        subset_size=-1,
        max_length=conf_data["max_words_per_message"],
        pad_to_max_length=conf_data.get("pad_to_max_length", True),
    )
    model = DistilBertClassifier(
        num_labels=y_train.shape[1],
def init():
    global current_run
    current_run = Run.get_context()
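# --- Added sketch (assumption: the init() above is a ParallelRunStep entry script) ---
# A ParallelRunStep entry script pairs init() with a run(mini_batch) function;
# the body below is a placeholder that assumes tabular mini-batches arrive as
# pandas DataFrames.
import pandas as pd

def run(mini_batch: pd.DataFrame) -> pd.DataFrame:
    # Called once per mini-batch; the returned rows are collected into the
    # step's aggregated output.
    results = mini_batch.copy()
    results["processed"] = True  # placeholder transformation
    return results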
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

encoded_dataset_train, encoded_dataset_eval = load_encoded_glue_dataset(
    task=task, tokenizer=tokenizer)

compute_metrics = construct_compute_metrics_function(args.task)

trainer = Trainer(
    model,
    training_args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.pop_callback(MLflowCallback)

print("Training...")

run = Run.get_context()  # get handle on Azure ML run
start = time.time()
trainer.train()
run.log("time/epoch",
        (time.time() - start) / 60 / training_args.num_train_epochs)

print("Evaluation...")

trainer.evaluate()
parser.add_argument('--regularization', type=float, dest='reg',
                    default=0.01, help='regularization rate')
args = parser.parse_args()

data_folder = os.path.join(args.data_folder, 'mnist')
print('Data folder:', data_folder)

# Load the train and test sets into numpy arrays. Note we scale the pixel
# intensity values to 0-1 (by dividing by 255.0) so the model can converge faster.
X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / 255.0
X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0
y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)
y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

# Get hold of the current run
run = Run.get_context()

print('Train a logistic regression model with regularization rate of', args.reg)
clf = LogisticRegression(C=1.0 / args.reg, random_state=42)
clf.fit(X_train, y_train)

print('Predict the test set')
y_hat = clf.predict(X_test)

# Calculate accuracy on the prediction
acc = np.average(y_hat == y_test)
print('Accuracy is', acc)

run.log('regularization rate', float(args.reg))
run.log('accuracy', float(acc))
def main():
    # ------------
    # args
    # ------------
    torch.manual_seed(0)
    pl.seed_everything(0)

    parser = argparse.ArgumentParser()
    parser.add_argument('--data-folder', type=str, dest='data_folder',
                        help='data folder mounting point')
    parser.add_argument('--batch-size', type=int, dest='batch_size', default=50,
                        help='mini batch size for training')
    parser.add_argument('--epoch', type=int, dest='epoch', default=10,
                        help='epoch size for training')
    parser.add_argument('--learning-rate', type=float, dest='learning_rate',
                        default=0.001, help='learning rate')
    parser.add_argument('--momentum', type=float, dest='momentum', default=0.9,
                        help='momentum')
    parser.add_argument('--model-name', type=str, dest='model_name',
                        default='resnet', help='fine-tuning model name')
    parser.add_argument('--optimizer', type=str, dest='optimizer', default='SGD',
                        help='Optimizer to use for training.')
    parser.add_argument('--criterion', type=str, dest='criterion',
                        default='cross_entropy',
                        help='Loss function to use for training.')
    parser.add_argument('--feature_extract', type=bool, dest='feature_extract',
                        default=True,
                        help='Flag for feature extracting. When False, we finetune '
                             'the whole model; when True, we only update the '
                             'reshaped layer params')
    args = parser.parse_args()
    args.num_workers = 8

    data_folder = args.data_folder
    print('training dataset is stored here:', data_folder)

    input_size = 224
    if args.model_name == "inception":
        input_size = 299

    # ---------------------------
    # Azure Machine Learning
    # 1) get Azure ML run context and log hyperparameters
    # ---------------------------
    run = Run.get_context()
    run.log('model_name', args.model_name)
    run.log('optimizer', args.optimizer)
    run.log('criterion', args.criterion)
    run.log('lr', float(args.learning_rate))
    run.log('momentum', float(args.momentum))
    # For your tagging
    # run.tag('description', 'xxx')

    # ------------
    # data
    # ------------
    transform = transforms.Compose([
        # Augmentation
        # transforms.RandomHorizontalFlip(),
        # transforms.RandomVerticalFlip(),
        transforms.RandomAffine(degrees=[-10, 10], translate=(0.1, 0.1),
                                scale=(0.5, 1.5)),
        transforms.RandomRotation(degrees=10),
        # Resize
        transforms.Resize(int(input_size * 1.3)),
        transforms.CenterCrop(input_size),
        # Tensor
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    dataset = torchvision.datasets.ImageFolder(args.data_folder, transform)
    args.num_classes = len(dataset.classes)

    n_train = int(len(dataset) * 0.7)
    n_val = int(len(dataset) * 0.15)
    n_test = len(dataset) - n_train - n_val

    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [n_train, n_val, n_test])

    train_loader = torch.utils.data.DataLoader(train_dataset, args.batch_size,
                                               shuffle=True, drop_last=True,
                                               num_workers=args.num_workers)
    val_loader = torch.utils.data.DataLoader(val_dataset, args.batch_size,
                                             num_workers=args.num_workers)
    test_loader = torch.utils.data.DataLoader(test_dataset, args.batch_size)

    # Initialize the model for this run
    model_ft, input_size = initialize_model(args.model_name, args.num_classes,
                                            feature_extract=args.feature_extract,
                                            use_pretrained=True)
    model = FineTurningModel(args, model_ft)

    # GPU configuration
    num_gpu = torch.cuda.device_count()
    print('num_gpu:', num_gpu)
    accelerator = None
    if num_gpu > 1:
        accelerator = 'ddp'  # only for a single machine

    # ------------
    # training
    # ------------
    trainer = pl.Trainer(max_epochs=args.epoch, gpus=num_gpu,
                         accelerator=accelerator)
    trainer.fit(model, train_loader, val_loader)

    # ------------
    # test (not validation)
    # ------------
    test_result = trainer.test(test_dataloaders=test_loader)

    run.log('test_acc', [res["test_acc"] for res in test_result][0])
    run.log('test_loss', [res["test_loss"] for res in test_result][0])
    run.log('test_acc_epoch', [res["test_acc_epoch"] for res in test_result][0])
    run.log('test_loss_epoch', [res["test_loss_epoch"] for res in test_result][0])

    # ------------
    # save model
    # ------------
    outputdir = './outputs/model'
    os.makedirs(outputdir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(outputdir, 'model.dict'))
    torch.save(model, os.path.join(outputdir, 'model.pt'))