def register_aml_model(
    model_path,
    model_name,
    model_tags,
    exp,
    run_id,
    dataset_id,
    build_id: str = 'none',
    build_uri=None
):
    try:
        tagsValue = {"area": "sales_forecast",
                     "run_id": run_id,
                     "experiment_name": exp.name}
        tagsValue.update(model_tags)
        if (build_id != 'none'):
            model_already_registered(model_name, exp, run_id)
            tagsValue["BuildId"] = build_id
            if (build_uri is not None):
                tagsValue["BuildUri"] = build_uri

        model = AMLModel.register(
            workspace=exp.workspace,
            model_name=model_name,
            model_path=model_path,
            tags=tagsValue,
            datasets=[('training data',
                       Dataset.get_by_id(exp.workspace, dataset_id))])
        os.chdir("..")
        print(
            "Model registered: {} \nModel Description: {} "
            "\nModel Version: {}".format(
                model.name, model.description, model.version
            )
        )
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print("Model registration failed")
        raise
def get_or_register_dataset(dataset_name: str,
                            datastore_name: str,
                            data_file_path: str,
                            aml_workspace: Workspace = None) -> Dataset:
    if dataset_name is None:
        raise Exception("Dataset name can't be null")

    if aml_workspace is None:
        print("No workspace defined - using current experiment workspace.")
        aml_workspace, *_ = get_aml_context(Run.get_context())

    if data_file_path == "nopath":
        print(f"get latest version of dataset: {dataset_name}")
        dataset = Dataset.get_by_name(aml_workspace, dataset_name)
    else:
        print(f"register a new dataset or new version: "
              f"{dataset_name}, {datastore_name}, {data_file_path}")
        dataset = register_dataset(aml_workspace,
                                   dataset_name,
                                   datastore_name,
                                   data_file_path)

    return dataset
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--data', type=str, help="Loading dataset")
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    # Split data into train and test sets
    dataset = Dataset.get_by_name(ws, name='diabetes_data_set')
    dataset = dataset.to_pandas_dataframe()
    x = dataset.drop(columns=['Outcome'])
    y = dataset['Outcome']
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')

    run.log("Accuracy", float(accuracy))
def main(experiment, environment, dataset):
    workspace = Workspace.from_config()
    experiment = Experiment(workspace, experiment)
    compute_target = ComputeTarget(workspace, environment)

    # Use the root of the solution as source folder for the run.
    root_folder = Path(__file__).parent.parent

    # Provide each of the datasets to the estimator as a named input.
    # You can access these from within the training script.
    datasets = [Dataset.get_by_name(workspace, ds).as_named_input(ds)
                for ds in dataset]

    estimator = SKLearn(
        source_directory=root_folder,
        entry_script='customer_churn/train.py',
        conda_dependencies_file='conda_dependencies.yml',
        compute_target=compute_target,
        inputs=datasets
    )

    run = experiment.submit(estimator)
    run.wait_for_completion(show_output=True)
def register_model(model_name, dataset_name, build_id):
    # Retrieve the workspace
    if run._run_id.startswith("OfflineRun"):
        workspace = Workspace.from_config()
    else:
        workspace = run.experiment.workspace

    # Retrieve the training dataset
    train_dataset = [(dataset_name,
                      Dataset.get_by_name(workspace, name=dataset_name))]

    # Get evaluation metric for model
    run_metrics = run.parent.get_metrics()

    # Define model file name
    model_file_name = "model.pkl"

    # Define model tags
    model_tags = {
        "build_id": build_id,
        "test_accuracy": run_metrics.get(evaluation_metric),
    }
    print("Variable [model_tags]:", model_tags)

    # Register the model
    model = run.parent.register_model(
        model_name=model_name,
        model_path=model_file_name,
        model_framework=Model.Framework.SCIKITLEARN,
        model_framework_version=sklearn.__version__,
        datasets=train_dataset,
        tags=model_tags,
    )
    print("Variable [model]:", model.serialize())
    logger.info(model.serialize())
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_data",
                        type=str,
                        help="Id of the registered train dataset")
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="Number of estimators")
    parser.add_argument('--max_depth',
                        type=int,
                        default=6,
                        help="Maximum depth of the trees")
    args = parser.parse_args()

    run.log("Number of estimators:", float(args.n_estimators))
    run.log("Max depth:", int(args.max_depth))

    # Create TabularDataset
    dataset = Dataset.get_by_id(ws, id=args.input_data)

    X_train, X_test, y_train, y_test = clean_data(dataset)

    model = XGBClassifier(n_estimators=args.n_estimators,
                          max_depth=args.max_depth).fit(X_train, y_train)

    # Save the model
    os.makedirs("outputs", exist_ok=True)
    filename = 'outputs/model.pkl'
    pickle.dump(model, open(filename, 'wb'))

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    run.log("Accuracy", float(accuracy))
def register_aml_model(run_id,
                       exp,
                       model_tags,
                       model_name,
                       model_path,
                       dataset_id,
                       build_id=None,
                       build_uri=None):
    try:
        tags_value = {
            'area': 'diabetes_regression',
            'run_id': run_id,
            'experiment_name': exp.name
        }
        tags_value.update(model_tags)

        if build_id is not None:
            model_already_registered(model_name, run_id, exp)
            tags_value['BuildId'] = build_id
            if build_uri is not None:
                tags_value['BuildUri'] = build_uri

        model = Model.register(
            workspace=exp.workspace,
            model_path=model_path,
            tags=tags_value,
            model_name=model_name,
            datasets=[
                ('training_data', Dataset.get_by_id(exp.workspace, dataset_id))
            ])
        print(
            f'{model_name} has been registered,\n'
            f'model description: {model.description},\n'
            f'model version: {model.version}'
        )
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print('model registration failed!')
        raise
def scale_up(self, workers=1):
    """
    Scale up the number of workers.
    """
    run_config = RunConfiguration()
    run_config.target = self.compute_target
    run_config.environment = self.environment_definition

    scheduler_ip = self.run.get_metrics()["scheduler"]
    args = [
        f"--scheduler_ip_port={scheduler_ip}",
        f"--use_gpu={self.use_gpu}",
        f"--n_gpus_per_node={self.n_gpus_per_node}",
        f"--worker_death_timeout={self.worker_death_timeout}",
    ]

    file_dataset_registered_name = self.kwargs.get(
        'file_dataset_registered_name', None)
    dataset_config_name = self.kwargs.get('dataset_config_name', None)
    path_on_compute = self.kwargs.get('path_on_compute', None)
    if path_on_compute is not None:
        dataset = Dataset.get_by_name(workspace=self.workspace,
                                      name=file_dataset_registered_name)
        input1 = dataset.as_named_input(dataset_config_name).as_mount(
            path_on_compute=path_on_compute)
        args.append(input1)

    child_run_config = ScriptRunConfig(
        source_directory=os.path.join(self.abs_path, "setup"),
        script="start_worker.py",
        arguments=args,
        run_config=run_config,
    )

    for i in range(workers):
        child_run = self.run.submit_child(child_run_config, tags=self.tags)
        self.workers_list.append(child_run)

    hostname = socket.gethostname()
def get_or_create_dataset(self, azure_dataset_id: str) -> FileDataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is
    no such dataset, a dataset is created and registered, assuming that the
    files are in a folder that has the same name as the dataset. For example,
    if azure_dataset_id is 'foo', then the 'foo' dataset should be pointing to
    the folder <container_root>/datasets/foo/
    """
    if not self.azureml_datastore:
        raise ValueError(
            "No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)"
        )
    if not azure_dataset_id:
        raise ValueError("No dataset ID provided.")
    logging.info(
        f"Retrieving datastore '{self.azureml_datastore}' from AzureML workspace"
    )
    workspace = self.get_workspace()
    datastore = Datastore.get(workspace, self.azureml_datastore)
    try:
        logging.info(
            f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    except Exception:
        logging.info(
            f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'"
        )
        # Ensure that there is a / at the end of the file path, otherwise folders that share a prefix could create
        # trouble (for example, folders foo and foo_bar exist, and I'm trying to create a dataset from "foo")
        azureml_dataset = Dataset.File.from_files(path=(datastore,
                                                        azure_dataset_id + "/"))
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset
# Helper file to submit an experiment run
import os

from azureml.core import Workspace, Experiment, Dataset
from azureml.core.model import Model
from azureml.train.estimator import Estimator
from azureml.core.authentication import AzureCliAuthentication
from azureml.data.data_reference import DataReference

# Load the Azure ML workspace
azureml_workspace = Workspace.from_config(auth=AzureCliAuthentication())

# Retrieve pointers to the latest dataset versions
redditcomments_gaming = Dataset.get_by_name(azureml_workspace,
                                            name='redditcomments_gaming',
                                            version='latest')
redditcomments = Dataset.get_by_name(azureml_workspace,
                                     name='redditcomments',
                                     version='latest')

# Configure the training run
est = Estimator(entry_script='train.py',
                script_params={'--alpha': 1.0},
                source_directory=os.path.dirname(os.path.realpath(__file__)),
                compute_target='ml-e2e',
                inputs=[redditcomments_gaming.as_named_input('comments')],
                pip_packages=[
                    "azureml-sdk", "azureml-mlflow", "matplotlib", "scipy",
                    "sklearn", "azure-cli", "pandas", "numpy"
                ])
        df_filtered.shape[0], start_time, end_time))
    return df_filtered


print("Check for new data and prepare the data")

parser = argparse.ArgumentParser("split")
parser.add_argument("--ds_name", help="name of the Dataset to update")
args = parser.parse_args()
print("Argument 1(ds_name): %s" % args.ds_name)

dstor = ws.get_default_datastore()

register_dataset = False
try:
    ds = Dataset.get_by_name(ws, args.ds_name)
    end_time_last_slice = ds.data_changed_time.replace(tzinfo=None)
    print("Dataset {0} last updated on {1}".format(args.ds_name,
                                                   end_time_last_slice))
except Exception as e:
    print(traceback.format_exc())
    print("Dataset with name {0} not found, registering new dataset.".format(
        args.ds_name))
    register_dataset = True
    end_time_last_slice = datetime.today() - relativedelta(weeks=1)

end_time = datetime.utcnow()
train_df = get_noaa_data(end_time_last_slice, end_time)

if train_df.size > 0:
    print("Received {0} rows of new data after {1}.".format(
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.steps import ParallelRunConfig
from azureml.pipeline.steps import ParallelRunStep

print("SDK version:", azureml.core.VERSION)

dataset_name = 'grib-dataset'

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

datastore = ws.get_default_datastore()

input_ds = Dataset.get_by_name(ws, dataset_name)
batch_data = DatasetConsumptionConfig("batch_dataset", input_ds, mode='mount')

output_dir = PipelineData(name='batch_output', datastore=datastore)

parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path='convert_parallel.yml')

batch_step = ParallelRunStep(name="batch-conversion-step",
                             parallel_run_config=parallel_run_config,
                             arguments=['--data_output_path', output_dir],
                             inputs=[batch_data],
                             output=output_dir,
                             allow_reuse=False)

steps = [batch_step]
def main():
    print("Running train.py")

    parser = argparse.ArgumentParser("train")

    parser.add_argument(
        "--build_id",
        type=str,
        help="The build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Dataset with the training data",
    )

    args = parser.parse_args()

    print("Argument [build_id]: %s" % args.build_id)
    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    build_id = args.build_id
    dataset_name = args.dataset_name

    print("Getting training parameters")

    with open("config.json") as f:
        pars = json.load(f)
    try:
        alpha = pars["training"]["alpha"]
    except KeyError:
        alpha = 0.5

    print("Parameter alpha: %s" % alpha)

    run = Run.get_context()
    ws = run.experiment.workspace

    if (dataset_name):
        dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
        df = dataset.to_pandas_dataframe()
        X = df.values
        y = df.Y
    else:
        X, y = load_diabetes(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}

    reg = train_model(run, data, alpha)

    joblib.dump(value=reg, filename=model_name)

    # upload model file explicitly into artifacts for parent run
    run.parent.upload_file(name="./outputs/" + model_name,
                           path_or_stream=model_name)
    print("Uploaded the model {} to experiment {}".format(
        model_name, run.experiment.name))
    dirpath = os.getcwd()
    print(dirpath)
    print("Following files are uploaded ")
    print(run.parent.get_file_names())

    run.parent.tag("BuildId", value=build_id)

    # Add properties to identify this specific training run
    run.tag("BuildId", value=build_id)
    run.tag("run_type", value="train")
    builduri_base = os.environ.get("BUILDURI_BASE")
    if (builduri_base is not None):
        build_uri = builduri_base + build_id
        run.tag("BuildUri", value=build_uri)
        run.parent.tag("BuildUri", value=build_uri)
    print(f"tags now present for run: {run.tags}")

    run.complete()
import argparse

from azureml.core import Dataset, Run

parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str)
args = parser.parse_args()

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by ID
dataset = Dataset.get_by_id(ws, id=args.input_data)

# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
def save_model(model, model_name, output_folder):
    output_path = output_folder + '/{}.pkl'.format(model_name)
    joblib.dump(value=model, filename=output_path)


def main(args, run, dataset):
    logging.basicConfig(level=logging.INFO)

    pd_dataset = dataset.to_pandas_dataframe()
    X, y = pd_dataset.iloc[:, :-1], pd_dataset.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state_test)

    model = train_decision_tree(args.max_leaf_nodes, args.random_state_model,
                                X_train, y_train)
    y_prediction = predict(model, X_test)

    model_accuracy = accuracy_score(y_true=y_test, y_pred=y_prediction)
    labels = y.unique()
    matrix = confusion_matrix(y_test, y_prediction, labels=labels)

    save_model(model, args.model_name, args.output_folder)
    log_results(model_accuracy, args.output_folder, args.model_name, matrix,
                labels, run)


if __name__ == '__main__':
    run = Run.get_context()
    args = get_parsed_args()
    makedirs(args.output_folder, exist_ok=True)
    dataset = Dataset.get_by_name(run.experiment.workspace,
                                  name=args.dataset_name)
    log_arguments(args, dataset, run)
    main(args, run, dataset)
    run.complete()
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="COVID19Articles_model_github.pkl",
    )
    parser.add_argument("--step_output",
                        type=str,
                        help="output for passing data to next step")
    parser.add_argument("--dataset_version",
                        type=str,
                        help="dataset version")
    parser.add_argument("--data_file_path",
                        type=str,
                        help="data file path; if specified, "
                             "a new version of the dataset will be registered")
    parser.add_argument("--dataset_name",
                        type=str,
                        help="Dataset name. Dataset must be passed by name "
                             "to always get the desired dataset version "
                             "rather than the one used while the pipeline creation")

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    datastore_name = os.environ.get("DATASTORE_NAME")
    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    # Get the dataset
    if (dataset_name):
        if (data_file_path == ""):
            if (dataset_name in Dataset.get_all(run.experiment.workspace).keys()):
                dataset = Dataset.get_by_name(run.experiment.workspace,
                                              dataset_name,
                                              version=dataset_version)
            else:
                create_sample_data_csv(run.experiment.workspace, datastore_name)
                dataset = register_dataset(run.experiment.workspace,
                                           dataset_name,
                                           datastore_name)
        else:
            dataset = register_dataset(run.experiment.workspace,
                                       dataset_name,
                                       datastore_name,
                                       data_file_path)
    else:
        if (data_file_path == ""):
            data_file_path = "COVID19Articles.csv"
            create_sample_data_csv(run.experiment.workspace, datastore_name)
        dataset_name = "COVID19Articles_Training_githubactions"
        dataset = register_dataset(run.experiment.workspace,
                                   dataset_name,
                                   datastore_name,
                                   data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    class_args = {"max_depth": 5}
    # Train the model
    model = train_model(data, class_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)

    # Files saved in the "outputs" folder are automatically uploaded into run history
    model_file_name = "COVID19Articles_model.pkl"
    joblib.dump(model, os.path.join('outputs', model_file_name))

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
argparser.add_argument('--n_layers', type=int, default=2)
argparser.add_argument('--learning_rate', type=float, default=0.01)
argparser.add_argument('--chunk_len', type=int, default=200)
argparser.add_argument('--batch_size', type=int, default=100)
argparser.add_argument('--shuffle', action='store_true')
argparser.add_argument('--cuda', action='store_true')
args = argparser.parse_args()

# TODO: Download the dataset you uploaded earlier by using
# the Dataset class using the 'get_by_name' and 'download' methods.
# Use the received file_path as input to the 'read_file()' function.
# HINT:
# In the 'get_by_name' method the name input-field should be 'name=args.dataset'
# The filepath is a list and 'read_file' expects a string
# https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.dataset.dataset?view=azure-ml-py
dataset = Dataset.get_by_name(ws, name=args.dataset)
file_path = dataset.download(target_path='.', overwrite=True)
file, file_len = read_file(file_path[0])  # TODO: Input the file path here


# Splitting dataset function
def random_training_set(chunk_len, batch_size):
    inp = torch.LongTensor(batch_size, chunk_len)
    target = torch.LongTensor(batch_size, chunk_len)
    for bi in range(batch_size):
        start_index = random.randint(0, file_len - chunk_len)
        end_index = start_index + chunk_len + 1
        chunk = file[start_index:end_index]
        if len(chunk[:-1]) < 200:
            continue
        inp[bi] = char_tensor(chunk[:-1])
from azureml.core import Workspace, Dataset

subscription_id = 'REPLACE'
resource_group = 'REPLACE'
workspace_name = 'REPLACE'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='MSFT')
dataset = dataset.to_pandas_dataframe()

print(dataset.head())
print(dataset.dtypes)
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    # XGBClassifier args
    parser.add_argument("--eta", type=float, help="Learning rate for model")
    parser.add_argument("--max_depth", type=int, help="Depth for trees")
    parser.add_argument("--min_child_weight",
                        type=int,
                        help="Min child weight for tree")
    parser.add_argument("--subsample",
                        type=float,
                        help="Subsample of training set used for each iteration")
    parser.add_argument("--colsample_bytree",
                        type=float,
                        help="Subsample of columns to use for each iteration")
    parser.add_argument("--early_stopping_rounds",
                        type=int,
                        help="Model will stop iterating if no improvement after set number of rounds")
    parser.add_argument("--eval_metric",
                        type=str,
                        default="auc",
                        help="Metric for evaluation")
    parser.add_argument("--scale_pos_weight",
                        type=float,
                        help="Control balance of positive and negative weights")
    parser.add_argument("--max_delta_step",
                        type=int,
                        help="Conservativeness of update step")
    parser.add_argument("--num_boost_rounds",
                        type=int,
                        help="Number of estimators")

    # SGD args
    parser.add_argument("--alpha",
                        type=float,
                        default=0.0001,  # numeric default (matches scikit-learn's SGDClassifier default)
                        help="Regularization strength")
    parser.add_argument("--l1_ratio",
                        type=float,
                        default=1.0,
                        help="l1_ratio in elasticnet penalty")

    # ExtraTreesClassifier args
    parser.add_argument("--n_estimators",
                        type=int,
                        help="Number of trees in the forest")
    parser.add_argument("--min_samples_split",
                        type=float,
                        help="Min number to split a node")
    parser.add_argument("--min_samples_leaf",
                        type=float,
                        help="Min number of samples at leaf node")
    parser.add_argument("--max_features",
                        type=float,
                        help="Number of features to consider for split")
    parser.add_argument("--ccp_alpha",
                        type=float,
                        help="Complexity parameter for pruning")

    args = parser.parse_args()

    # Retrieve dataset by name | Create train/val split
    location = Dataset.get_by_name(workspace=workspace,
                                   name="cleaned_loan_dataset").download()
    print(location)

    train = pd.read_parquet(location[0])
    x_train, x_val = train_test_split(train,
                                      test_size=0.3,
                                      stratify=train.default_status,
                                      random_state=20)
    y_train, y_val = x_train.pop("default_status"), x_val.pop("default_status")

    # SGD Classifier
    scaler = StandardScaler()
    sgd = SGDClassifier(alpha=args.alpha,
                        l1_ratio=args.l1_ratio,
                        penalty="elasticnet",
                        loss="modified_huber",
                        class_weight="balanced",
                        early_stopping=True)
    sgd_clf = make_pipeline(scaler, sgd)

    # ExtraTreesClassifier
    etc_clf = ExtraTreesClassifier(n_estimators=args.n_estimators,
                                   min_samples_split=args.min_samples_split,
                                   min_samples_leaf=args.min_samples_leaf,
                                   max_features=args.max_features,
                                   ccp_alpha=args.ccp_alpha,
                                   class_weight="balanced")

    # XGBoost
    xgb_clf = XGBClassifier(objective="binary:logistic",
                            n_estimators=args.num_boost_rounds,
                            max_depth=args.max_depth,
                            min_child_weight=args.min_child_weight,
                            learning_rate=args.eta,
                            subsample=args.subsample,
                            colsample_bytree=args.colsample_bytree,
                            eval_metric=args.eval_metric,
                            scale_pos_weight=args.scale_pos_weight,
                            max_delta_step=args.max_delta_step)

    # VotingClassifier
    model = VotingClassifier(estimators=[("sgd", sgd_clf), ("etc", etc_clf),
                                         ("xgb", xgb_clf)],
                             voting="soft")
    model.fit(x_train, y_train)

    # Make prediction on the validation dataset & log AUC
    y_pred = model.predict(x_val)
    auc_score = roc_auc_score(y_val, y_pred, average="weighted")
    run.log("auc", float(auc_score))
    print("Classification Report: \n", classification_report(y_val, y_pred))

    # Dump model artifact
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, "outputs/model_voting.joblib")
dest="target_column_name", help="Target Column Name", ) parser.add_argument( "--test_dataset", type=str, dest="test_dataset", help="Test Dataset" ) args = parser.parse_args() target_column_name = args.target_column_name test_dataset_id = args.test_dataset run = Run.get_context() ws = run.experiment.workspace # get the input dataset by id test_dataset = Dataset.get_by_id(ws, id=test_dataset_id) X_test = ( test_dataset.drop_columns(columns=[target_column_name]) .to_pandas_dataframe() .reset_index(drop=True) ) y_test_df = ( test_dataset.with_timestamp_columns(None) .keep_columns(columns=[target_column_name]) .to_pandas_dataframe() ) # generate forecast fitted_model = joblib.load("model.pkl") # We have default quantiles values set as below(95th percentile)
from azureml.core import Workspace, Dataset, Experiment, Run
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.widgets import RunDetails
import os

from utils import get_workspace

ws = get_workspace()

cluster_name = "bbacompute"
dataset_name = "bearing_dataset"

dataset = Dataset.get_by_name(ws, dataset_name)

try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print("cluster exists: ", cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size="standard_d12_v2",
                                                           max_nodes=1)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    cluster.wait_for_completion(show_output=True)

exp_name = "exp_bearing_anomaly_lstm"
experiment = Experiment(ws, name=exp_name)

estimator = TensorFlow(
    source_directory='.',
    entry_script='lstm.py',
    script_params={'--run_at': 'remote'},
    inputs=[dataset.as_named_input('bearingdata')],
    compute_target=cluster,
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="insure_model_model.pkl",
    )
    parser.add_argument("--step_output",
                        type=str,
                        help="output for passing data to next step")
    parser.add_argument("--dataset_version",
                        type=str,
                        help="dataset version")
    parser.add_argument("--data_file_path",
                        type=str,
                        help="data file path; if specified, "
                             "a new version of the dataset will be registered")
    parser.add_argument("--caller_run_id",
                        type=str,
                        help="caller run id, for example ADF pipeline run id")
    parser.add_argument("--dataset_name",
                        type=str,
                        help="Dataset name. Dataset must be passed by name "
                             "to always get the desired dataset version "
                             "rather than the one used while the pipeline creation")

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)
        else:
            dataset = register_dataset(run.experiment.workspace,
                                       dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data[1])
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from azureml.core import Workspace, Datastore, Dataset, Run
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.model import Model
import pandas as pd
import os
import joblib
import sklearn

# get the current run
run = Run.get_context()
ws = run.experiment.workspace
datastore = ws.get_default_datastore()

# get the dataset
ds = Dataset.get_by_name(ws, "diabetes_cleaned")
diabetes_df = ds.to_pandas_dataframe()

X = diabetes_df.drop("class", axis=1)
y = diabetes_df["class"]

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=diabetes_df["class"], random_state=0)

# init the model
model = KNeighborsClassifier()

# train the model
model.fit(X_train, y_train)
                   overwrite=True,
                   show_progress=False)

# Register dataset
path_on_datastore = os.path.join(target_path, file_name)
dataset = Dataset.Tabular.from_delimited_files(path=(default_ds, path_on_datastore))
dataset = dataset.register(workspace=aml_workspace,
                           name=dataset_name,
                           description='diabetes training data',
                           tags={'format': 'CSV'},
                           create_new_version=True)

# Get the dataset
dataset = Dataset.get_by_name(aml_workspace, dataset_name)

# Create a PipelineData to pass data between steps
pipeline_data = PipelineData('pipeline_data',
                             datastore=aml_workspace.get_default_datastore())

# Configure step for training model
train_model = PythonScriptStep(
    name="Train Model",
    script_name=variables["TRAIN_SCRIPT_PATH"],
    compute_target=compute_target,
    runconfig=run_config,
    inputs=[dataset.as_named_input('training_data')],
    outputs=[pipeline_data],
    allow_reuse=False,
    arguments=["--step_output", pipeline_data])
args = parser.parse_args()

target_column_name = args.target_column_name
model_name = args.model_name

print("args passed are: ")
print("Target column name: ", target_column_name)
print("Name of registered model: ", model_name)

model_path = Model.get_model_path(model_name)
# deserialize the model file back into a sklearn model
model = joblib.load(model_path)

run = Run.get_context()
test_dataset = Dataset.get_by_id(run.experiment.workspace, id=args.input_data)

X_test_df = test_dataset.drop_columns(
    columns=[target_column_name]).to_pandas_dataframe()
y_test_df = (test_dataset.with_timestamp_columns(None).keep_columns(
    columns=[target_column_name]).to_pandas_dataframe())

predicted = model.predict_proba(X_test_df)
if isinstance(predicted, pd.DataFrame):
    predicted = predicted.values

# Use the AutoML scoring module
train_labels = model.classes_
class_labels = np.unique(
    np.concatenate((y_test_df.values, np.reshape(train_labels, (-1, 1)))))
import os
import random

from azureml.core import Dataset, Run

from load_data import train_test_bert, prep_train_test_bert

random.seed(4)

run = Run.get_context()
workspace = run.experiment.workspace

dataset = Dataset.get_by_name(workspace=workspace, name='dataset')
dataset.download(target_path='.', overwrite=False)

dist = Dataset.get_by_name(workspace=workspace, name='dataset_dist')
dist.download(target_path='.', overwrite=False)


def bert_precompute():
    prep_train_test_bert('./media.csv',
                         './dist.dat',
                         './models/1024dRoBertAModel',
                         10,
                         result_path='./result1024dRoBertA.txt',
                         check=1,
                         pretrained_weights='roberta-base')


def train_mlp():
    os.makedirs(os.path.dirname('./outputs/'), exist_ok=True)
    precalced = Dataset.get_by_name(workspace, name='distilbert-base-uncased_pack')
    precalced.download(target_path='./outputs/', overwrite=False)
    train_test_bert('./media.csv',
                    './dist.dat',
                    './models/768dBertModel',
                    10,
                    result_path='./result768dBert.txt',
                    check=1,
                    pretrained_weights='distilbert-base-uncased')


train_mlp()
parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=1e-5)
parser.add_argument('--adam_epsilon', type=float, dest='adam_epsilon', default=1e-8)
parser.add_argument('--num_epochs', type=int, dest='num_epochs', default=5)
args = parser.parse_args()

dataset_name = args.dataset_name
batch_size = args.batch_size
learning_rate = args.learning_rate
adam_epsilon = args.adam_epsilon
num_epochs = args.num_epochs

run = Run.get_context()
workspace = run.experiment.workspace
dataset = Dataset.get_by_name(workspace, name=dataset_name)
file_name = dataset.download()[0]

df = pd.read_csv(file_name)

label_counts = pd.DataFrame(df['Product'].value_counts())
label_values = list(label_counts.index)
order = list(pd.DataFrame(df['Product_Label'].value_counts()).index)
label_values = [l for _, l in sorted(zip(order, label_values))]

texts = df['Complaint'].values
labels = df['Product_Label'].values

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                do_lower_case=True)
text_ids = [tokenizer.encode(text, max_length=300, pad_to_max_length=True)
            for text in texts]
from sklearn.externals import joblib
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper
import os
import pandas as pd
import shutil

from azureml.core import Run, Dataset, Workspace

ws = Run.get_context().experiment.workspace
os.makedirs('./outputs', exist_ok=True)

attritionData = Dataset.get_by_name(ws, 'employeeattrition').to_pandas_dataframe()

# Dropping Employee count as all values are 1 and hence attrition is independent of this feature
attritionData = attritionData.drop(['EmployeeCount'], axis=1)
# Dropping Employee Number since it is merely an identifier
attritionData = attritionData.drop(['EmployeeNumber'], axis=1)
attritionData = attritionData.drop(['Over18'], axis=1)
# Since all values are 80
attritionData = attritionData.drop(['StandardHours'], axis=1)

attritionData["Attrition_numerical"] = attritionData["Attrition"]
target = attritionData["Attrition_numerical"]

attritionXData = attritionData.drop(['Attrition_numerical', 'Attrition'], axis=1)

# Creating dummy columns for each categorical feature
def parse_id_to_dataset(dataset_id):
    run = Run.get_context()
    ws = run.experiment.workspace
    return Dataset.get_by_id(ws, id=dataset_id)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
# from sklearn.model_selection import TimeSeriesSplit
# from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.arima_model import ARIMA
from azureml.core import Dataset, Run

run = Run.get_context()

# get input dataset by name
# dataset = run.input_datasets['transaction_ts']
ws = run.experiment.workspace
dataset1 = Dataset.get_by_name(workspace=ws, name='transaction_ts2013')

df = dataset1.to_pandas_dataframe()
df.set_index('TransactionDate', inplace=True)
df.columns = ['PaidAmount']
series = pd.Series(df['PaidAmount'])


def mean_and_variance(X):
    split = int(len(X) / 2)
    X1, X2 = X[0:split], X[split:]
    mean1, mean2 = X1.mean(), X2.mean()
    var1, var2 = X1.var(), X2.var()
    print('mean1=%f, mean2=%f' % (mean1, mean2))
    print('variance1=%f, variance2=%f' % (var1, var2))