def is_stale(self):
    """Return boolean to describe whether the computed profile is stale or not.

    A Profile is considered to be stale if there is changed in underlying data after
    the profile is computed.
    - if the data source change cannot be detected, TypeError is raised.
    - if the data source was changed after submitting the profile run, the flag will be True;
    - otherwise, the profile matches current data, and the flag will be False.

    :return: boolean to describe whether the computed profile is stale or not.
    :rtype: bool
    """
    # Imported locally to avoid a circular import at module load time.
    from azureml.core import Dataset

    # Re-resolve the saved dataset so staleness is checked against its
    # current state, not the state captured when this object was created.
    dataset = Dataset.get_by_id(self._workspace, id=self._saved_dataset_id)
    workspace = dataset._ensure_workspace(self._workspace)

    # Build a profile-action request; _ACTION_TYPE_PROFILE is defined
    # elsewhere in this module. The arguments mirror those used when the
    # profile run was originally submitted.
    request_dto = ActionRequestDto(
        action_type=_ACTION_TYPE_PROFILE,
        saved_dataset_id=dataset._ensure_saved(workspace),
        arguments={'generate_preview': 'True', 'row_count': '1000'})

    # Query the dataset service for the cached action result; the service
    # reports whether the profile is still up to date with the data source.
    action_result_dto = _restclient(workspace).dataset.get_action_result(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_id=_LEGACY_DATASET_ID,
        request=request_dto,
        custom_headers=_custom_headers)

    # None means the service could not determine staleness (e.g. change
    # detection unsupported for this source); surface its error message.
    if action_result_dto.is_up_to_date is None:
        raise AzureMLException(action_result_dto.is_up_to_date_error)
    return not action_result_dto.is_up_to_date
def main():
    """Train a RandomForestClassifier on the run's input dataset and log results.

    Parses hyperparameters from the command line, loads the dataset by ID,
    trains the model, logs AUC, and saves the model to ``outputs/model.pkl``.

    NOTE(review): relies on module-level ``run``, ``ws``, ``process_data``
    (Azure ML Run/Workspace and a data-prep helper) defined elsewhere in
    this file.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_estimators', type=int, default=100,
                        help="Number of trees in the forest.")
    parser.add_argument('--max_depth', type=int, default=None,
                        help="Maximum depth of tree.")
    parser.add_argument('--input_data', type=str)
    args = parser.parse_args()

    dataset = Dataset.get_by_id(ws, id=args.input_data)

    # Drop NAs and encode data.
    x, y = process_data(dataset)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

    # np.int/np.float were deprecated aliases of the builtins and were removed
    # in NumPy 1.24 -- use the builtins instead.
    run.log("Number of Estimators:", int(args.n_estimators))
    # max_depth defaults to None, which int()/np.int() cannot convert and
    # would crash the run -- log the raw value instead.
    run.log("Max iterations:", args.max_depth)

    model = RandomForestClassifier(n_estimators=args.n_estimators,
                                   max_depth=args.max_depth).fit(x_train, y_train)

    # roc_auc_score expects y_true first, then the predicted scores; the
    # original call had the arguments swapped.
    roc_auc = roc_auc_score(y_test, model.predict(x_test))
    run.log("auc", float(roc_auc))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=model, filename='outputs/model.pkl')
def update_output_lineage(workspace, output_datasets):
    """Resolve and attach the Dataset object for each output-dataset record.

    Each record in ``output_datasets`` is expected to carry
    ``record['identifier']['savedId']``; the resolved Dataset is stored
    under ``record['dataset']``. A falsy input is a no-op.
    """
    if not output_datasets:
        return
    for record in output_datasets:
        saved_id = record['identifier']['savedId']
        record['dataset'] = Dataset.get_by_id(workspace, saved_id)
def find_set(set_id):
    """
    Find a dataset in the inputs by its ID, required for Tabular Datasets.

    :param set_id: ID of the dataset to load.
    :returns: The found dataset.
    """
    workspace = Run.get_context().experiment.workspace
    return Dataset.get_by_id(workspace, set_id)
def main():
    """Train a GradientBoostingRegressor on the run's input dataset.

    Parses hyperparameters, loads the dataset by ID from the current run's
    workspace, trains the model, logs normalized RMSE and R^2, and saves
    the model to ``outputs/model.joblib``.

    NOTE(review): relies on ``split_train_label_data`` defined elsewhere
    in this file.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str)
    parser.add_argument('--n_estimators', type=int, default=100,
                        help="The number of boosting stages to perform")
    parser.add_argument(
        '--learning_rate', type=float, default=0.1,
        help="Learning rate shrinks the contribution of each tree by learning_rate")
    args = parser.parse_args()

    run = Run.get_context()
    ws = run.experiment.workspace

    # get the input dataset by ID
    ds = Dataset.get_by_id(ws, id=args.input_data)
    df = ds.to_pandas_dataframe()

    # Mean of the target column, used to normalize the RMSE below.
    erp_mean = df['ERP'].mean()
    x, y = split_train_label_data(df)

    # Split data into train and test sets: 20% of the dataset to include in the test split.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=0)

    # np.int/np.float were deprecated aliases of the builtins and were removed
    # in NumPy 1.24 -- use the builtins instead.
    run.log("n_estimators:", int(args.n_estimators))
    run.log("learning_rate:", float(args.learning_rate))

    model = GradientBoostingRegressor(n_estimators=args.n_estimators,
                                      learning_rate=args.learning_rate,
                                      max_depth=1,
                                      random_state=0,
                                      loss='huber').fit(x_train, y_train)

    # MSE
    mse = mean_squared_error(y_test, model.predict(x_test))
    # normalized_root_mean_squared_error => to be comparable with Azure results
    metric = math.sqrt(mse) / erp_mean
    run.log("normalized_root_mean_squared_error", float(metric))
    # Metric reported is 'r2_score' => metric to optimize
    run.log("r2_score", float(model.score(x_test, y_test)))

    os.makedirs('outputs', exist_ok=True)
    # Save the model into run history
    joblib.dump(model, 'outputs/model.joblib')
def get_data(dataset_id):
    '''Get the dataset and perform general preparation.'''
    context = Run.get_context()
    workspace = context.experiment.workspace  # pylint: disable=invalid-name
    frame = Dataset.get_by_id(workspace, id=dataset_id).to_pandas_dataframe()

    # Keep only text content: the third column of the dataframe.
    docs = frame.iloc[:, 2].values

    # Drop falsy entries (empty strings / None) before returning.
    return [doc for doc in docs if doc]
def main():
    """Train a RandomForestClassifier on the run's input dataset.

    Parses hyperparameters, loads the dataset by ID, trains, logs accuracy
    and a confusion matrix, and saves the model to ``outputs/model.pkl``.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--max-depth', type=int, default=5)
    parser.add_argument("--input-data", type=str)
    args = parser.parse_args()

    run = Run.get_context()
    # np.int was a deprecated alias of the builtin and was removed in
    # NumPy 1.24 -- use the builtin instead.
    run.log("Num estimators:", int(args.n_estimators))
    run.log("Max depth:", int(args.max_depth))

    ws = run.experiment.workspace
    # get the input dataset by ID
    dataset = Dataset.get_by_id(ws, id=args.input_data)
    # load the TabularDataset to pandas DataFrame
    df = dataset.to_pandas_dataframe()

    X = df.drop(columns=['drugs_related_stop'])
    y = df['drugs_related_stop']
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=42)

    model = RandomForestClassifier(n_estimators=args.n_estimators,
                                   max_depth=args.max_depth).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    # np.float removed in NumPy 1.24 as well.
    run.log("accuracy", float(accuracy))

    value = {
        "schema_type": "confusion_matrix",
        "schema_version": "v1",
        "data": {
            "class_labels": ["0", "1"],
            "matrix": confusion_matrix(y_test, model.predict(x_test)).tolist()
        }
    }
    run.log_confusion_matrix(name='Confusion Matrix', value=value)

    os.makedirs('outputs', exist_ok=True)
    # note file saved in the outputs folder is automatically uploaded into experiment record
    joblib.dump(value=model, filename='outputs/model.pkl')
def register_aml_model(
    model_path,
    model_name,
    model_tags,
    exp,
    run_id,
    dataset_id,
    build_id: str = 'none',
    build_uri=None
):
    """Register a trained model with the experiment's workspace.

    Tags the model with the run/experiment context plus any caller-supplied
    tags; when a build ID is supplied, checks for a prior registration and
    records the build metadata. Re-raises on any registration failure after
    printing the traceback.
    """
    try:
        tags = {
            "area": "aml_recommender",
            "run_id": run_id,
            "experiment_name": exp.name,
        }
        tags.update(model_tags)

        # Only CI builds (build_id != 'none') get build metadata attached.
        if build_id != 'none':
            model_already_registered(model_name, exp, run_id)
            tags["BuildId"] = build_id
            if build_uri is not None:
                tags["BuildUri"] = build_uri

        model = AMLModel.register(
            workspace=exp.workspace,
            model_name=model_name,
            model_path=model_path,
            tags=tags,
            datasets=[
                ('training data', Dataset.get_by_id(exp.workspace, dataset_id))
            ])

        # Preserved from the original: step back out of the working directory
        # after registration.
        os.chdir("..")
        print(
            "Model registered: {} \nModel Description: {} "
            "\nModel Version: {}".format(
                model.name, model.description, model.version
            )
        )
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print("Model registration failed")
        raise
def register_aml_model(run_id,
                       exp,
                       model_tags,
                       model_name,
                       model_path,
                       dataset_id,
                       build_id=None,
                       build_uri=None):
    """Register a trained diabetes-regression model with the workspace.

    Tags the model with run/experiment context plus caller-supplied tags;
    when a build ID is given, checks for an existing registration and adds
    build metadata. Prints the traceback and re-raises on failure.
    """
    try:
        tags = {
            'area': 'diabetes_regression',
            'run_id': run_id,
            'experiment_name': exp.name,
        }
        tags.update(model_tags)

        # Build metadata is only attached for CI-triggered registrations.
        if build_id is not None:
            model_already_registered(model_name, run_id, exp)
            tags['BuildId'] = build_id
            if build_uri is not None:
                tags['BuildUri'] = build_uri

        model = Model.register(
            workspace=exp.workspace,
            model_path=model_path,
            tags=tags,
            model_name=model_name,
            datasets=[
                ('training_data',
                 Dataset.get_by_id(exp.workspace, dataset_id))
            ])
        print(
            f'{model_name} has been registered,\nmodel description: {model.description},\nmodel version: {model.version}'
        )
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print('model registration failed!')
        raise
def main():
    """Train an XGBClassifier on the run's input dataset and log accuracy.

    Parses hyperparameters, loads the dataset by ID, trains, saves the model
    to ``outputs/model.pkl``, and logs test accuracy.

    NOTE(review): relies on module-level ``run``, ``ws``, and ``clean_data``
    defined elsewhere in this file.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_data", type=str,
                        help="Id of the registered train dataset")
    parser.add_argument('--n_estimators', type=int, default=100,
                        help="Number of estimators")
    parser.add_argument('--max_depth', type=int, default=6,
                        help="Maximum depth of the trees")
    args = parser.parse_args()

    # np.int/np.float were deprecated aliases of the builtins and were removed
    # in NumPy 1.24. n_estimators is an integer, so log it as one rather than
    # coercing to float as the original did.
    run.log("Number of estimators:", int(args.n_estimators))
    run.log("Max depth:", int(args.max_depth))

    # Create TabularDataset
    dataset = Dataset.get_by_id(ws, id=args.input_data)
    X_train, X_test, y_train, y_test = clean_data(dataset)

    model = XGBClassifier(n_estimators=args.n_estimators,
                          max_depth=args.max_depth).fit(X_train, y_train)

    # Saving the model; the context manager closes the file handle, which the
    # original pickle.dump(model, open(...)) leaked.
    os.makedirs("outputs", exist_ok=True)
    filename = 'outputs/model.pkl'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    run.log("Accuracy", float(accuracy))
# Score a registered classification model against a test dataset.
# NOTE(review): this chunk is truncated at both ends -- `parser` and
# `args.input_data` are defined before this view, and `class_labels` is
# consumed after it.
args = parser.parse_args()
target_column_name = args.target_column_name
model_name = args.model_name

print("args passed are: ")
print("Target column name: ", target_column_name)
print("Name of registered model: ", model_name)

model_path = Model.get_model_path(model_name)
# deserialize the model file back into a sklearn model
model = joblib.load(model_path)

run = Run.get_context()
# Resolve the test dataset from the current run's workspace by ID.
test_dataset = Dataset.get_by_id(run.experiment.workspace, id=args.input_data)

# Features: everything except the target column.
X_test_df = test_dataset.drop_columns(
    columns=[target_column_name]).to_pandas_dataframe()
# Labels: just the target column, with any timestamp columns cleared first.
y_test_df = (test_dataset.with_timestamp_columns(None).keep_columns(
    columns=[target_column_name]).to_pandas_dataframe())

predicted = model.predict_proba(X_test_df)
# Some estimators return a DataFrame of probabilities; normalize to ndarray.
if isinstance(predicted, pd.DataFrame):
    predicted = predicted.values

# Use the AutoML scoring module
train_labels = model.classes_
# Union of labels seen in training and in the test set, so scoring covers
# classes missing from either side.
class_labels = np.unique(
    np.concatenate((y_test_df.values, np.reshape(train_labels, (-1, 1)))))
def parse_id_to_dataset(dataset_id):
    """Resolve a dataset ID to a Dataset via the current run's workspace."""
    workspace = Run.get_context().experiment.workspace
    return Dataset.get_by_id(workspace, id=dataset_id)
# Example: submitting a script that receives a dataset ID as an argument.
# NOTE(review): this chunk is truncated at the start -- `packages`, `env`,
# and `tab_ds` are defined before this view.
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', tab_ds],
                                environment=env)

# Script
from azureml.core import Run, Dataset
parser.add_argument('--ds', type=str, dest='dataset_id')
args = parser.parse_args()
run = Run.get_context()
ws = run.experiment.workspace
# The dataset ID passed on the command line is resolved in the run's workspace.
dataset = Dataset.get_by_id(ws, id=args.dataset_id)
data = dataset.to_pandas_dataframe()

# use named input
env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
                                    pip_packages=['azureml-defaults',
                                                  'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
# Same submission, but the dataset is passed as a named input instead of a
# raw ID string.
script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', tab_ds.as_named_input('my_dataset')],
                                environment=env)
# Minimal training-script preamble: parse the dataset ID from the command
# line and load the TabularDataset into a pandas DataFrame.
import argparse
from azureml.core import Dataset, Run

parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str)
args = parser.parse_args()

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by ID
dataset = Dataset.get_by_id(ws, id=args.input_data)

# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
# Data-cleansing pipeline step: keep selected columns, rename them, and drop
# all-null rows. NOTE(review): truncated at both ends -- `parser`,
# `args.raw_data`, `args.output_cleanse`, and `get_dict` are defined outside
# this view.
parser.add_argument("--useful_columns", type=str, help="useful columns to keep")
parser.add_argument("--columns", type=str, help="rename column pattern")
args = parser.parse_args()

print("Argument 1(raw data id): %s" % args.raw_data)
print("Argument 2(columns to keep): %s" %
      str(args.useful_columns.strip("[]").split(";")))
print("Argument 3(columns renaming mapping): %s" %
      str(args.columns.strip("{}").split(";")))
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)

run = Run.get_context()
raw_data = Dataset.get_by_id(run.experiment.workspace, id=args.raw_data)

# These functions ensure that null data is removed from the dataset,
# which will help increase machine learning model accuracy.
# The columns argument arrives as "[a;b;c]" -- strip brackets/quotes and split.
useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(";")
]
columns = get_dict(args.columns)

# Drop rows that are entirely null, apply the rename map, then keep only the
# requested columns.
new_df = (raw_data.to_pandas_dataframe().dropna(how='all').rename(
    columns=columns))[useful_columns]
new_df.reset_index(inplace=True, drop=True)

# NOTE(review): the body of this guard lies outside the visible chunk.
if not (args.output_cleanse is None):
# Forecast-scoring step: load the test dataset by ID and split it into
# features and target. NOTE(review): truncated at both ends -- this begins
# inside a parser.add_argument(...) call started before this view, and the
# forecasting logic continues after it.
    dest="target_column_name",
    help="Target Column Name",
)
parser.add_argument(
    "--test_dataset", type=str, dest="test_dataset", help="Test Dataset"
)
args = parser.parse_args()

target_column_name = args.target_column_name
test_dataset_id = args.test_dataset

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by id
test_dataset = Dataset.get_by_id(ws, id=test_dataset_id)

# Features: all columns except the target, with a fresh integer index.
X_test = (
    test_dataset.drop_columns(columns=[target_column_name])
    .to_pandas_dataframe()
    .reset_index(drop=True)
)
# Target: only the target column, with timestamp columns cleared first.
y_test_df = (
    test_dataset.with_timestamp_columns(None)
    .keep_columns(columns=[target_column_name])
    .to_pandas_dataframe()
)

# generate forecast
fitted_model = joblib.load("model.pkl")

# We have default quantiles values set as below(95th percentile)
from azureml.core import Run, Dataset
from azureml.data._dataset_client import _DatasetClient

if __name__ == '__main__':
    # Positional arguments: dataset ID, action ID, and (optionally empty)
    # saved-dataset ID identifying the dataflow to execute against.
    dataset_id = sys.argv[1]
    action_id = sys.argv[2]
    saved_dataset_id = sys.argv[3]
    print('Start execution with action_id = {0}, dataset_id = {1} and '
          'saved_dataset_id = {2}'.format(action_id, dataset_id,
                                          saved_dataset_id))

    workspace = Run.get_context().experiment.workspace
    if workspace is None:
        raise TypeError('Workspace is found to be None')

    dataflow_json = None
    if saved_dataset_id:
        # Resolve the saved dataset so its dataflow can be passed along;
        # any lookup failure is surfaced as a TypeError with context.
        try:
            dataset = Dataset.get_by_id(workspace, saved_dataset_id)
        except Exception:
            errorMsg = 'Failed to get the dataset details by saved dataset id {}'.format(
                saved_dataset_id)
            print(errorMsg)
            raise TypeError(errorMsg)
        dataflow_json = dataset._dataflow.to_json()

    _DatasetClient._execute_dataset_action(workspace, dataset_id, action_id,
                                           dataflow_json)
# TF-IDF text-classification prep: load the dataset by ID, filter rare
# classes, and vectorize tweets. NOTE(review): truncated at both ends --
# `parser` is built before this view and training continues after it.
args = parser.parse_args()
min_df = args.min_df
max_df = args.max_df
ngram_min = args.ngram_min
ngram_max = args.ngram_max
n_estimators = args.n_estimators
max_depth = args.max_depth
min_samples_split = args.min_samples_split
min_class_frequency = args.min_class_frequency
input_data = args.input_data

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by ID
dataset = Dataset.get_by_id(ws, id=input_data)
run.log("Dataset Version", dataset.version)
data_ml = dataset.to_pandas_dataframe()

# take only data which KKS has more than 5 relating records
data_ml = data_ml.groupby('target').filter(
    lambda x: len(x) >= min_class_frequency)

# drop nulls
data_ml = data_ml[["target", "tweet"]].dropna().copy()

# Character n-gram TF-IDF over word boundaries; bounds come from the CLI args.
vec = TfidfVectorizer(analyzer='char_wb',
                      ngram_range=(ngram_min, ngram_max),
                      min_df=min_df,
                      max_df=max_df)
X = vec.fit_transform(data_ml['tweet'])
y = data_ml["target"]
run.log("Number of classes", len(set(y)))
def main():
    """Train a multiclass XGBoost model on run-input train/test datasets.

    Parses hyperparameters, loads both datasets by ID, trains with a
    watchlist, evaluates F1/accuracy on train and test, saves the model to
    ``outputs/xgboost_model.pkl``, and logs all metrics.

    NOTE(review): relies on a module-level ``run`` (azureml Run) defined
    elsewhere in this file.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser(
        description="hyperparameters of the logistic regression model")
    parser.add_argument('--test-set', type=str, help="Name of your test set")
    parser.add_argument('--train-set', type=str, help="Name of your training set")
    parser.add_argument('--max-depth', type=int, default=3,
                        help="How deep is the tree growing during one round of boosting")
    parser.add_argument('--min-child-weight', type=int, default=2,
                        help="Minimum sum of weight for all observations in a child. Controls overfitting")
    parser.add_argument('--gamma', type=float, default=0,
                        help="Gamma corresponds to the minimum loss reduction required to make a split.")
    parser.add_argument('--subsample', type=float, default=0.9,
                        help="What fraction of samples are randomly sampled per tree.")
    parser.add_argument('--colsample-bytree', type=float, default=0.8,
                        help="What fraction of feature columns are randomly sampled per tree.")
    parser.add_argument('--reg-alpha', type=float, default=0.00001,
                        help="L1 regularization of the weights. Increasing the values "
                             "more strongly prevents overfitting.")
    parser.add_argument('--eta', type=float, default=0.2,
                        help="Learning rate for XGBoost.")
    parser.add_argument('--seed', type=int, default=42, help="Random seed.")
    parser.add_argument('--num-iterations', type=int, default=20,
                        help="Number of fitting iterations")
    args = parser.parse_args()

    params = {
        'eta': args.eta,
        'max_depth': args.max_depth,
        'min_child_weight': args.min_child_weight,
        'gamma': args.gamma,
        'subsample': args.subsample,
        'colsample_bytree': args.colsample_bytree,
        'reg_alpha': args.reg_alpha,
        'seed': args.seed,
        'objective': 'multi:softmax',
        'num_class': 3,
    }

    # np.int/np.float were deprecated aliases of the builtins and were removed
    # in NumPy 1.24 -- log with the builtins instead.
    run.log("max depth:", int(args.max_depth))
    run.log("min_child_weight:", float(args.min_child_weight))
    run.log("gamma", float(args.gamma))
    run.log("subsample:", float(args.subsample))
    run.log("colsample_bytree:", float(args.colsample_bytree))
    run.log("reg alpha:", float(args.reg_alpha))
    run.log("learning rate:", float(args.eta))

    # Load the Training Dataset and the Test Dataset
    ws = run.experiment.workspace
    dataset_training = Dataset.get_by_id(ws, id=args.train_set)
    dataset_test = Dataset.get_by_id(ws, id=args.test_set)
    run.log("loaded_dataset", str(dataset_test))
    train_df = dataset_training.to_pandas_dataframe()
    test_df = dataset_test.to_pandas_dataframe()

    # Convert the training and test sets to sparse matrices and then create a
    # xgboost DMatrix for efficient computation
    x_train = scipy.sparse.csr_matrix(
        train_df.drop(columns=['norm_rating']).to_numpy())
    y_train = list(train_df.norm_rating)
    x_test = scipy.sparse.csr_matrix(
        test_df.drop(columns=['norm_rating']).to_numpy())
    y_test = list(test_df.norm_rating)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)

    # Train the model
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = args.num_iterations
    model = xgb.train(params, dtrain, num_round, watchlist)

    # Evaluate the model
    pred_test = model.predict(dtest)
    pred_train = model.predict(dtrain)

    # Compute F1 scores and Accuracy scores
    f1_score_weighted_train = f1_score(y_train, pred_train, average='weighted')
    accuracy_train = accuracy_score(y_train, pred_train)
    f1_score_weighted = f1_score(y_test, pred_test, average='weighted')
    accuracy = accuracy_score(y_test, pred_test)

    os.makedirs('outputs', exist_ok=True)
    # note file saved in the outputs folder is automatically uploaded into experiment record
    joblib.dump(value=model, filename='outputs/xgboost_model.pkl')

    run.log("F1ScoreWeightedTrain", float(f1_score_weighted_train))
    run.log("F1ScoreWeighted", float(f1_score_weighted))
    run.log("AccuracyTrain", float(accuracy_train))
    run.log("Accuracy", float(accuracy))