def main():
    """Train a LogisticRegression classifier on the bank-marketing dataset.

    Parses --C and --max_iter from the command line, logs hyperparameters and
    metrics (accuracy, weighted AUC) to the Azure ML run, and saves the fitted
    model under ./outputs so it is uploaded into run history automatically.
    """
    # Add arguments to script.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--C",
        type=float,
        default=1.0,
        help="Inverse of regularization strength. Smaller values cause stronger regularization",
    )
    parser.add_argument(
        "--max_iter",
        type=int,
        default=100,
        help="Maximum number of iterations to converge",
    )
    args = parser.parse_args()

    # np.float / np.int were deprecated in NumPy 1.20 and removed in 1.24;
    # the plain builtins are the documented replacement.
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    ds = TabularDatasetFactory().from_delimited_files(
        path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    )
    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=7
    )

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    # Probability of the positive class is needed for ROC-AUC.
    pred_prob = model.predict_proba(x_test)
    auc_score = roc_auc_score(y_test, pred_prob[:, 1], average="weighted")
    run.log("AUC", float(auc_score))

    # Files saved in the "outputs" folder are automatically uploaded into run history.
    os.makedirs("outputs", exist_ok=True)
    joblib.dump(model, "./outputs/model.joblib")
def main():
    """Train a LogisticRegression classifier on the bank-marketing dataset.

    Parses --C and --max_iter from the command line, logs hyperparameters and
    test accuracy to the Azure ML run, and saves the fitted model under
    ./outputs so it is uploaded into run history automatically.
    """
    # Add arguments to script.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help="Inverse of regularization strength. Smaller values cause stronger regularization",
    )
    parser.add_argument(
        '--max_iter',
        type=int,
        default=100,
        help="Maximum number of iterations to converge",
    )
    args = parser.parse_args()

    # np.float / np.int were deprecated in NumPy 1.20 and removed in 1.24;
    # the plain builtins are the documented replacement.
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    factory = TabularDatasetFactory()
    ds = factory.from_delimited_files(path)
    X, y = clean_data(ds)

    # Split data into train and test sets.
    train_data, test_data, train_label, test_label = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(
        train_data, train_label
    )

    accuracy = model.score(test_data, test_label)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    # Save model.
    joblib.dump(model, 'outputs/model.joblib')
def main():
    """Train a LogisticRegression classifier on the bank-marketing dataset.

    Parses --C and --max_iter from the command line, logs hyperparameters and
    test accuracy to the Azure ML run, and saves the fitted model under
    ./outputs so it is uploaded into run history automatically.
    """
    # Add arguments to script.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help="Inverse of regularization strength. Smaller values cause stronger regularization",
    )
    parser.add_argument(
        '--max_iter',
        type=int,
        default=100,
        help="Maximum number of iterations to converge",
    )
    args = parser.parse_args()

    # np.float / np.int were deprecated in NumPy 1.20 and removed in 1.24;
    # the plain builtins are the documented replacement.
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    # Create a TabularDataset from the remote CSV using TabularDatasetFactory.
    factory = TabularDatasetFactory()
    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = factory.from_delimited_files(path=train_data_path)

    # Clean the data.
    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Fit a Logistic Regression model; C is the inverse of regularization
    # strength (see the scikit-learn LogisticRegression documentation).
    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    # Persist the model with joblib; files under "outputs" are uploaded
    # into run history automatically.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def main():
    """Train a LogisticRegression classifier using separate train/validate files.

    Parses --C and --max_iter from the command line, fits on the training CSV,
    scores on the validation CSV, logs hyperparameters and accuracy to the
    Azure ML run, and saves the fitted model under ./outputs.
    """
    # Add arguments to script.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help="Inverse of regularization strength. Smaller values cause stronger regularization",
    )
    parser.add_argument(
        '--max_iter',
        type=int,
        default=100,
        help="Maximum number of iterations to converge",
    )
    args = parser.parse_args()

    # np.float / np.int were deprecated in NumPy 1.20 and removed in 1.24;
    # the plain builtins are the documented replacement.
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    factory = TabularDatasetFactory()
    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    valid_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"
    train_ds = factory.from_delimited_files(train_data_path)
    valid_ds = factory.from_delimited_files(valid_data_path)

    X_train, y_train = clean_data(train_ds)
    X_valid, y_valid = clean_data(valid_ds)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(X_train, y_train)

    accuracy = model.score(X_valid, y_valid)
    run.log("Accuracy", float(accuracy))

    # Files under "outputs" are uploaded into run history automatically.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/bankmarketing-logit-model.joblib')
def main():
    """Train a LightGBM binary classifier using separate train/validate files.

    Parses LightGBM hyperparameters from the command line, fits on the
    training CSV, scores accuracy on the validation CSV, logs everything to
    the Azure ML run, and saves the booster under ./outputs.
    """
    # Add arguments to script.
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.03,
                        help="Learning rate param for lgbm")
    parser.add_argument('--max_depth', type=int, default=10,
                        help="Limit the tree depth explicitly")
    parser.add_argument('--num_leaves', type=int, default=255,
                        help="Control the complexity of the tree model")
    parser.add_argument('--min_data_in_leaf', type=int, default=3,
                        help="Large value can avoid growing too deep a tree")
    parser.add_argument('--num_iterations', type=int, default=500,
                        help="Number of boosting iterations")
    args = parser.parse_args()

    # np.float / np.int were deprecated in NumPy 1.20 and removed in 1.24;
    # the plain builtins are the documented replacement.
    run.log("learning-rate:", float(args.learning_rate))
    run.log("max_depth:", int(args.max_depth))
    run.log("num_leaves", int(args.num_leaves))
    run.log("min_data_in_leaf", int(args.min_data_in_leaf))
    run.log("num_iterations", int(args.num_iterations))

    factory = TabularDatasetFactory()
    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    valid_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"
    train_ds = factory.from_delimited_files(train_data_path)
    valid_ds = factory.from_delimited_files(valid_data_path)

    X_train, y_train = clean_data(train_ds)
    X_valid, y_valid = clean_data(valid_ds)

    d_train = lgbm.Dataset(X_train, label=y_train)

    # Fix: the original assigned "1.0," (a trailing comma), which made
    # colsample_bytree the tuple (1.0,) — an invalid LightGBM parameter value.
    lgbm_params = {
        'learning_rate': args.learning_rate,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'max_depth': args.max_depth,
        'num_leaves': args.num_leaves,
        'min_data_in_leaf': args.min_data_in_leaf,
        'colsample_bytree': 1.0,
    }

    model = lgbm.train(lgbm_params, d_train, args.num_iterations)

    # Booster.predict returns probabilities for a binary objective; round to
    # hard labels. accuracy_score's documented order is (y_true, y_pred).
    accuracy = accuracy_score(y_valid, model.predict(X_valid).round(0).astype(int))
    run.log("Accuracy", float(accuracy))

    # Files under "outputs" are uploaded into run history automatically.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/bankmarketing-lgbm-model.joblib')
import os import numpy as np from sklearn.metrics import mean_squared_error import joblib from sklearn.model_selection import train_test_split from sklearn.preprocessing import OneHotEncoder import pandas as pd from azureml.core.run import Run from azureml.data.dataset_factory import TabularDatasetFactory # TODO: Create TabularDataset using TabularDatasetFactory # Data is located at: # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv" ### YOUR CODE HERE ### ds = TabularDatasetFactory() \ .from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv") def clean_data(data): # Dict for cleaning data months = { "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10,