Example #1
def reactivate_dataset(workspace=None,
                       dataset_name=None,
                       dataset_id=None,
                       logger=None):
    if _check_python() is False:
        raise UserErrorException(
            'The dataset command subgroup is only supported with Python 3.5 or later'
        )
    dataset = Dataset.get(workspace, dataset_name, dataset_id)
    dataset_state = dataset.state
    if dataset_state == 'active':
        raise UserErrorException("Dataset '{}' ({}) is already active".format(
            dataset.name, dataset.id))
    dataset.reactivate()
    dataset = Dataset.get(workspace, name=dataset.name)
    if dataset.state == 'active':
        logger.info("Dataset '{}' ({}) was reactivated successfully".format(
            dataset.name, dataset.id))
        return dataset._get_base_info_dict_show()
    else:
        logger.debug(
            "dataset reactivate error. name: {} id: {} state: {}".format(
                dataset.name, dataset.id, dataset.state))
        raise Exception("Error, Dataset '{}' ({}) was not reactivated".format(
            dataset.name, dataset.id))
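
A minimal usage sketch for the helper above, assuming a workspace config file on disk and a previously deactivated dataset named 'my-dataset' (both assumptions):

# Hypothetical usage of reactivate_dataset; the workspace config and dataset name are assumptions.
import logging
from azureml.core import Workspace

logger = logging.getLogger(__name__)
ws = Workspace.from_config()  # reads config.json from the current directory
info = reactivate_dataset(workspace=ws, dataset_name='my-dataset', logger=logger)
print(info)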
def init():
    global g_tf_sess, probabilities, label_dict, input_images

    parser = argparse.ArgumentParser(description="Start a tensorflow model serving")
    parser.add_argument('--model_name', dest="model_name", required=True)
    parser.add_argument('--labels_name', dest="labels_name", required=True)
    args, _ = parser.parse_known_args()

    workspace = Run.get_context(allow_offline=False).experiment.workspace
    label_ds = Dataset.get_by_name(workspace=workspace, name=args.labels_name)
    label_ds.download(target_path='.', overwrite=True)

    label_dict = get_class_label_dict()
    classes_num = len(label_dict)

    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        input_images = tf.placeholder(tf.float32, [1, image_size, image_size, num_channel])
        logits, _ = inception_v3.inception_v3(input_images,
                                              num_classes=classes_num,
                                              is_training=False)
        probabilities = tf.argmax(logits, 1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    g_tf_sess = tf.Session(config=config)
    g_tf_sess.run(tf.global_variables_initializer())
    g_tf_sess.run(tf.local_variables_initializer())

    model_path = Model.get_model_path(args.model_name)
    saver = tf.train.Saver()
    saver.restore(g_tf_sess, model_path)
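
The init() above only builds the TensorFlow 1.x graph and session. If the script is used as a batch-scoring entry script (for example with ParallelRunStep), it conventionally also defines a run() entry point; a hedged sketch follows, where the JPEG decoding, the image_size/num_channel globals, and label_dict being indexable by class id are all assumptions:

# Hedged sketch of a run() entry point to pair with init(); the image format (JPEG)
# and the structure of label_dict (indexable by predicted class id) are assumptions.
def run(mini_batch):
    results = []
    for image_path in mini_batch:
        raw = tf.gfile.GFile(image_path, 'rb').read()
        image = tf.image.decode_jpeg(raw, channels=num_channel)        # uint8 HxWxC
        image = tf.image.resize_images(image, [image_size, image_size])
        image = tf.expand_dims(tf.cast(image, tf.float32) / 255.0, 0)  # [1, H, W, C]
        batch = g_tf_sess.run(image)
        pred = g_tf_sess.run(probabilities, feed_dict={input_images: batch})
        results.append('{}: {}'.format(image_path, label_dict[pred[0]]))
    return results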
Example #3
 def download_file(self,
                   dataset_name,
                   dataset_type='pandas',
                   *args,
                   **kwargs):
     """
     Downloads file from file storage
     :param dataset_name: name of the dataset
     :param dataset_type: name of the dataset type
     :param args: other arguments containing additional information
     :param kwargs: other keyword arguments containing additional information
     """
     try:
         run = Run.get_context(allow_offline=False)
     except RunEnvironmentException as e:
         raise SkipServiceException('Skip AmlModelStorageHandler handler')
     ws = run.experiment.workspace
     ds = Dataset.get_by_name(workspace=ws,
                              name=dataset_name)  # Get a Dataset by name
     if dataset_type == 'spark':
         df = ds.to_spark_dataframe()  # Load the Tabular Dataset into a Spark DataFrame
     else:
         df = ds.to_pandas_dataframe()  # Load the Tabular Dataset into a pandas DataFrame
     return df
Example #4
    def upload_file(self,
                    dataset: pd.DataFrame,
                    dataset_name: str,
                    dataset_type: str = 'pandas',
                    *args,
                    **kwargs):
        """
        Uploads file to file storage
        :param dataset: dataset object
        :param dataset_name: name of the dataset
        :param dataset_type: name of the dataset type
        :param args: other arguments containing additional information
        :param kwargs: other keyword arguments containing additional information
        """
        run = Run.get_context()
        ws = run.experiment.workspace
        if dataset_type == 'pandas':
            ds = Dataset.from_pandas_dataframe(
                dataframe=dataset
            )  # Create a Dataset from the pandas DataFrame
        else:
            raise Exception(
                "ERROR: only pandas DataFrame is supported at the moment.")

        ds.register(ws, name=dataset_name)
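
Dataset.from_pandas_dataframe was removed from later azureml-core releases; a hedged alternative sketch using the Tabular dataset factory is shown below (the DataFrame contents, datastore, and dataset name are assumptions):

# Hedged alternative for newer SDK versions; the DataFrame and names are placeholders.
import pandas as pd
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
datastore = ws.get_default_datastore()
sample_df = pd.DataFrame({'feature': [1, 2, 3], 'label': [0, 1, 0]})
registered = Dataset.Tabular.register_pandas_dataframe(
    dataframe=sample_df,
    target=datastore,  # datastore where the backing files are written
    name='my-tabular-dataset')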
Example #5
def main():
    parser = argparse.ArgumentParser()
    #params = {"objective": "binary:logistic", "max_depth": 3}
    run = Run.get_context()

    parser.add_argument('--num_boost_round', type=int, default=5, help="Number of boosting rounds")
    parser.add_argument('--max_depth', type=int, default=3, help="Maximum depth of the trees to be boosted")
    parser.add_argument('--learning_rate', type=float, default=0.001, help="Learning rate, xgb's eta")
    parser.add_argument('--gamma', type=float, default=0.1, help="Minimum loss reduction")
    parser.add_argument('--reg_lambda', type=float, default=0.1, help="L2 regularization term on weights")
    parser.add_argument('--scale_pos_weight', type=float, default=1.0, help="Balancing of positive and negative weights")

    args = parser.parse_args()

    workspace = run.experiment.workspace

    key = "heart-failure" #"Heart failure"
    description_text = "Heart failure dataset for udacity capstone"

    if key in workspace.datasets.keys():
        found = True
        dataset = Dataset.get_by_name(workspace, name=key)
    else:
        raise KeyError("Dataset '{}' is not registered in the workspace".format(key))

    df = dataset.to_pandas_dataframe()
    X, y = df.loc[:, ~df.columns.isin(["DEATH_EVENT"])], df.loc[:, "DEATH_EVENT"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)


    params = {"scale_pos_weight": np.float(args.scale_pos_weight),
              "reg_lambda": np.float(args.reg_lambda),
              "gamma": np.float(args.gamma),
              "learning_rate": np.float(args.learning_rate),
              "max_depth": np.int(args.max_depth),
              "num_boost_round": np.int(args.num_boost_round)}

    run.log("scale_pos_weigth:", np.float(args.scale_pos_weight))
    run.log("Max depth:", np.int(args.max_depth))
    run.log("Learning rate:", np.int(args.learning_rate))
    run.log("Boosting rounds:", np.int(args.num_boost_round))
    run.log("Gamma (minimum loss reduction):", np.int(args.learning_rate))
    run.log("Lambda (L2 regularization):", np.int(args.reg_lambda))

    xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                        n_estimators = np.int(args.num_boost_round),
                        max_depth = np.int(args.max_depth),
                        learning_rate=np.float(args.learning_rate),
                        gamma= np.float(args.gamma),
                        reg_lambda=np.float(args.reg_lambda),
                        scale_pos_weight=np.float(args.scale_pos_weight),
                        random_state=123)
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    xgb_auc = auc(fpr, tpr)

    run.log("AUC", np.float(xgb_auc))
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=xgb_model, filename='outputs/model.pkl')
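
Because this training script logs "AUC" and takes its hyperparameters from the command line, it could be swept with HyperDrive. A hedged driver sketch; the script file name, compute target, environment file, experiment name, and search ranges are all assumptions:

# Hedged HyperDrive driver for the script above; names and ranges are placeholders.
from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig
from azureml.train.hyperdrive import (HyperDriveConfig, RandomParameterSampling,
                                      PrimaryMetricGoal, choice, uniform)

ws = Workspace.from_config()
env = Environment.from_conda_specification('xgb-env', 'conda_dependencies.yml')
src = ScriptRunConfig(source_directory='.', script='train.py',
                      compute_target='cpu-cluster', environment=env)

sampling = RandomParameterSampling({
    'max_depth': choice(2, 3, 4, 5),
    'learning_rate': uniform(0.001, 0.3),
    'gamma': uniform(0.0, 1.0),
})
hd_config = HyperDriveConfig(run_config=src,
                             hyperparameter_sampling=sampling,
                             primary_metric_name='AUC',
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=20)
hd_run = Experiment(ws, 'heart-failure-hyperdrive').submit(hd_config)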
Example #6
def unregister_dataset(workspace=None, dataset_name=None, logger=None):
    try:
        dataset = Dataset.get_by_name(workspace, dataset_name)
    except Exception:
        print('There is no dataset registered with name "{}".'.format(
            dataset_name))
        return
    dataset.unregister_all_versions()
    print('Successfully unregistered datasets with name "{}".'.format(
        dataset_name))
def RunAutoML():
        subscription_id = request.json['subscription_id']
        resource_group = request.json['resource_group']
        workspace_name = request.json['workspace_name']
        file_name = request.json['file_name']
        #location = request.json['location']
    
        ws = Workspace(subscription_id=subscription_id,
                                  resource_group=resource_group,
                                  workspace_name=workspace_name)
                                            
        print("Found workspace {} at location {}".format(ws.name, ws.location))
        print('Found existing Workspace.')
            
        dataset_name = file_name

        # Get a dataset by name
        df = Dataset.get_by_name(workspace=ws, name=dataset_name)
        stock_dataset_df = df.to_pandas_dataframe()
        print('file successfully received.')
        stock_dataset_df.head()
        #stock_dataset_json = stock_dataset_df.to_json(orient='split')
        #print(stock_dataset_json)
        y_df = stock_dataset_df['ActionTaken'].values
        x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
        
        ExperimentName = request.json['ExperimentName']       
        tasks = request.json['tasks']
        iterations = request.json['iterations']
        iteration_timeout_minutes = request.json['iteration_timeout_minutes']
        primary_metric = request.json['primary_metric']
        
        #n_cross_validations = request.json['n_cross_validations']
        
        try:
            automl_config = AutoMLConfig(
                task=tasks,
                X=x_df,
                y=y_df,
                iterations=iterations,
                iteration_timeout_minutes=iteration_timeout_minutes,
                primary_metric=primary_metric,
                #n_cross_validations=n_cross_validations,
                preprocess=True,
                )
            experiment = Experiment(ws, ExperimentName)
            run = experiment.submit(config=automl_config, show_output=True)
    
            best_run, fitted_model = run.get_output()

            return 'ok'
        except Exception:

            return 'error'
Example #8
def get_df_from_dataset(dataset_path, dataset_name, dataset_is_remote=False):
    """
    Return a DataFrame by reading the dataset from either a
    directory containing csv files or a registered Azure ML Dataset
    """
    if dataset_is_remote:
        workspace = package_utils.get_workspace()
        df = Dataset.get_by_name(workspace=workspace,
                                 name=dataset_name).to_pandas_dataframe()
    else:
        df = get_df_from_directory(pathlib.Path(dataset_path, dataset_name))
    return df
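
A hypothetical call of the helper above; the local directory and dataset name are placeholders:

# Hypothetical usage; 'data' and 'heart-failure' are placeholder names.
df_local = get_df_from_dataset('data', 'heart-failure', dataset_is_remote=False)
df_remote = get_df_from_dataset('data', 'heart-failure', dataset_is_remote=True)
print(df_remote.shape)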
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--n_estimators',
                        type=int,
                        default=50,
                        help="Number of trees")
    parser.add_argument('--max_depth',
                        type=int,
                        default=3,
                        help="Maximum depth of the trees to used")
    parser.add_argument('--min_samples_split',
                        type=int,
                        default=2,
                        help="Minimum samples")

    run = Run.get_context()
    workspace = run.experiment.workspace

    key = "heart-failure"  #"Heart failure"
    description_text = "Heart failure dataset for udacity capstone"

    if key in workspace.datasets.keys():
        found = True
        dataset = Dataset.get_by_name(workspace, name=key)
    else:
        raise KeyError("Dataset '{}' is not registered in the workspace".format(key))

    df = dataset.to_pandas_dataframe()
    X, y = df.loc[:, ~df.columns.isin(["DEATH_EVENT"])], df.loc[:,
                                                                "DEATH_EVENT"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=123)

    args = parser.parse_args()

    run.log("n_estimators:", np.int(args.n_estimators))
    run.log("max_depth:", np.int(args.max_depth))
    run.log("min_samples_split:", np.int(args.min_samples_split))

    rf_model = RandomForestClassifier(n_estimators=np.int(args.n_estimators),
                                      max_depth=np.int(args.max_depth),
                                      min_samples_split=np.int(
                                          args.min_samples_split),
                                      random_state=123)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    rf_auc = auc(fpr, tpr)

    run.log("AUC", np.float(rf_auc))
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=rf_model, filename='outputs/model.pkl')
Example #10
def get_dataset(workspace=None,
                dataset_name=None,
                dataset_version=None,
                dataset_id=None,
                logger=None):
    if dataset_name is None and dataset_id is None:
        raise UserErrorException('Argument {} or {} must be specified'.format(
            DATASET_NAME.long_form, DATASET_ID.long_form))
    if dataset_name is not None and dataset_id is not None:
        raise UserErrorException(
            'Arguments {} and {} cannot be specified at the same time'.format(
                DATASET_NAME.long_form, DATASET_ID.long_form))
    if dataset_version != DATASET_VERSION.default and dataset_name is None:
        raise UserErrorException(
            'Argument {} must be specified with {}'.format(
                DATASET_VERSION.long_form, DATASET_NAME.long_form))
    dataset_version = dataset_version or DATASET_VERSION.default
    if dataset_name is not None:
        dataset = Dataset.get_by_name(workspace, dataset_name, dataset_version)
    else:
        dataset = Dataset.get_by_id(workspace, dataset_id)
    return _dataset_to_printable(dataset)
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run = Run.get_context()
    workspace = run.experiment.workspace

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    #The dataset is registered using Python SDK in the notebook
    dataset_name = 'Framingham-Prepared'

    # Get a dataset by name
    ds = Dataset.get_by_name(workspace=workspace, name=dataset_name)
    x, y = clean_data(ds)

    # Split data into train and test sets.

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=223)

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)

    run.log("Accuracy", np.float(accuracy))
    #save the best model
    os.makedirs('outputs', exist_ok=True)

    joblib.dump(value=model, filename='outputs/model.joblib')
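
If the trained model should also be registered with the workspace, a hedged addition that could go at the end of main(), after the joblib.dump call (the model name is an assumption):

    # Hypothetical follow-up inside main(); 'framingham-logreg' is a placeholder name.
    model = run.register_model(model_name='framingham-logreg',
                               model_path='outputs/model.joblib')
    print(model.name, model.version)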
Example #12
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--kernel', type=str, default='linear',
                        help='Kernel type to be used in the algorithm')
    parser.add_argument('--penalty', type=float, default=1.0,
                        help='Penalty parameter of the error term')

    args = parser.parse_args()

    run = Run.get_context()
    ws = run.experiment.workspace

    run.log('Kernel type', np.str(args.kernel))
    run.log('Penalty', np.float(args.penalty))


    heart_dataset = Dataset.get_by_name(workspace=ws, name='Heart-Failure')
    df = heart_dataset.to_pandas_dataframe()
    

    y = df[df.columns[-1]]
    X = df.drop(df.columns[-1],axis=1)
    
    scaler = StandardScaler()
    scaled_X = scaler.fit_transform(X)

    # dividing X, y into train and test data
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, random_state=0)

    # training a linear SVM classifier
    from sklearn.svm import SVC
    svm_model_linear = SVC(kernel=args.kernel, C=args.penalty).fit(X_train, y_train)
    svm_predictions = svm_model_linear.predict(X_test)

    # model accuracy for X_test
    accuracy = accuracy_score(svm_predictions, y_test)

    #print('Accuracy of SVM classifier on test set: {:.2f}'.format(accuracy))
    run.log('Accuracy', np.float(accuracy))
    

    os.makedirs('outputs', exist_ok=True)
    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(svm_model_linear, 'outputs/model.joblib')
def DataBlob():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')
    ds = ws.get_default_datastore()
    print(ds.datastore_type, ds.account_name, ds.container_name)
    try:
        stock_ds = Dataset.Tabular.from_delimited_files(
            path=ds.path(file_name))
        stock_ds = stock_ds.register(workspace=ws,
                                     name=file_name,
                                     description='stock training data')
        print('Found existing file name')
        return "This file name exist. Please rename or upload new file"
    except Exception:
        print('Uploading new file, please wait')

    stock_dataset = Dataset.Tabular.from_delimited_files(
        path=ds.path(file_name))
    stock_dataset = stock_dataset.register(workspace=ws,
                                           name=file_name,
                                           description='stock training data')
    #file_name = json.loads(file_name)
    print(type(file_name))
    new_data = Dataset.get_by_name(ws, file_name, version='latest')
    print(new_data.name)
    print(type(new_data.name))
    stock_dataset_df = new_data.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_json = stock_dataset_df.to_json(orient='split')
    return stock_dataset_json
Example #14
def main():

    print(azureml.core.VERSION)

    dataset_name = getRuntimeArgs()

    run = Run.get_context()
    ws = run.experiment.workspace

    ds = Dataset.get_by_name(workspace=ws, name=dataset_name)

    automl_settings = {
        "task": 'classification',
        "verbosity": logging.INFO,
        "primary_metric": 'accuracy',
        "experiment_timeout_hours": 0.05,
        "n_cross_validations": 3,
        "enable_stack_ensemble": False,
        "enable_voting_ensemble": False,
        "model_explainability": True,
        "preprocess": True,
        "max_cores_per_iteration": -1,
        "max_concurrent_iterations": 4,
        "training_data": ds,
        "drop_column_names": ['Sno'],
        "label_column_name": 'Risk'
    }

    automl_config = AutoMLConfig(**automl_settings)
    run = run.submit_child(automl_config, show_output=True)

    best_run, fitted_model = run.get_output()

    output_dir = './outputs/'
    os.makedirs(output_dir, exist_ok=True)
    shutil.copy2('automl.log', output_dir)

    with open(output_dir + 'best_run.json', 'w') as f:
        json.dump(best_run.get_details(), f, default=str)
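
A hedged follow-up that could come after get_output() inside main(), registering the best AutoML model with the workspace (the model name and description are assumptions):

    # Hypothetical follow-up; the model name and description are placeholders.
    model = run.register_model(model_name='credit-risk-automl',
                               description='Best AutoML classification model')
    print(model.name, model.id, model.version, sep='\t')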
    def _setup_dataset(self, ds_name, data_paths):
        """
        registers datasets with azureml workspace

        :param str ds_name: [required] name to give the dataset in azureml.
        :param str data_paths: [required] list of paths to your data on the datastore.
        """
        self.named_ds = []
        count = 1
        for data_path in data_paths:
            curr_name = ds_name + str(count)
            path_on_datastore = self.blob_ds.path(data_path)
            input_ds = Dataset.File.from_files(path=path_on_datastore,
                                               validate=False)
            try:
                registered_ds = input_ds.register(workspace=self.ws,
                                                  name=curr_name,
                                                  create_new_version=True)
            except Exception as e:
                n, v = self._parse_exception(e)
                registered_ds = Dataset.get_by_name(self.ws, name=n, version=v)
            self.named_ds.append(registered_ds.as_named_input(curr_name))
            count = count + 1
Example #16
experiment = Experiment(ws, '<<experiment_name>>')
automl_run = Run(experiment=experiment, run_id='<<run_id>>')

# Check if this AutoML model is explainable
if not automl_check_model_if_explainable(automl_run):
    raise Exception("Model explanations is currently not supported for " +
                    automl_run.get_properties().get('run_algorithm'))

# Download the best model from the artifact store
automl_run.download_file(name=MODEL_PATH, output_file_path='model.pkl')

# Load the AutoML model into memory
fitted_model = joblib.load('model.pkl')

# Get the train dataset from the workspace
train_dataset = Dataset.get_by_name(workspace=ws,
                                    name='<<train_dataset_name>>')
# Drop the label column to get the training set.
X_train = train_dataset.drop_columns(columns=['<<target_column_name>>'])
y_train = train_dataset.keep_columns(columns=['<<target_column_name>>'],
                                     validate=True)

# Get the test dataset from the workspace
test_dataset = Dataset.get_by_name(workspace=ws, name='<<test_dataset_name>>')
# Drop the label column to get the testing set.
X_test = test_dataset.drop_columns(columns=['<<target_column_name>>'])

# Set up the class for explaining the AutoML model
automl_explainer_setup_obj = automl_setup_model_explanations(fitted_model,
                                                             '<<task>>',
                                                             X=X_train,
                                                             X_test=X_test,
                                                             y=y_train)

ds.upload(src_dir=file_path,
          target_path=None,
          overwrite=True,
          show_progress=True)

stock_ds = Dataset.Tabular.from_delimited_files(path=datastore.path(file_name))
stock_ds = stock_ds.register(workspace=ws,
                             name=file_name,
                             description='Introact Owner Data')

compute_target = AmlCompute(ws, cluster_name)
print('Found existing AML compute context.')
dataset_name = file_name

# Get a dataset by name
df = Dataset.get_by_name(workspace=ws, name=dataset_name)

X = df.drop_columns(columns=[target_var])
y = df.keep_columns(columns=[target_var], validate=True)
print(y)
#y = diabetes.pop('Y')
#X_train, X_test, y_train, y_test = train_test_split(diabetes, y, test_size=0.2, random_state=0)
#data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}}
conda_run_config = RunConfiguration(framework="python")
conda_run_config.environment.docker.enabled = True
conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                              conda_packages=['numpy', 'py-xgboost<=0.80'])
conda_run_config.environment.python.conda_dependencies = cd
print('run config is ready')
workspace = Workspace.from_config(auth=AzureCliAuthentication())

# Define the conda dependencies
cd = CondaDependencies(conda_dependencies_file_path=os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'conda_dependencies_sklearn.yml'))

# define compute
compute_target = '20cpucluster'

# define data set names
input_name_train = 'newsgroups_train'
input_name_test = 'newsgroups_test'

# Retrieve datsets
dataset_train = Dataset.get_by_name(workspace, name=input_name_train)
dataset_test = Dataset.get_by_name(workspace, name=input_name_test)

# Runconfig
amlcompute_run_config = RunConfiguration(
    script="train.py",
    conda_dependencies=cd,
    framework='Python',
)

amlcompute_run_config.environment.docker.enabled = True
amlcompute_run_config.environment.spark.precache_packages = False
amlcompute_run_config.target = compute_target
amlcompute_run_config.data = {
    input_name_train: load_data(dataset_train, input_name_train),
    input_name_test: load_data(dataset_test, input_name_test)
}
from azureml.core.experiment import Experiment
from azureml.core import Run
experiment = Experiment(ws, 'Myexp2_v1_test21')
best_run = Run(experiment=experiment,
               run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8')
fitted_model = Run(experiment=experiment,
                   run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8')
#print(best_run.register_model()
print(fitted_model)

# Get a dataset by name
from azureml.core.dataset import Dataset

file_name = '2018Q4PredictionTrainedSet101.csv'
stock_dataset = Dataset.get_by_name(ws, '2018Q4PredictionTrainedSet101.csv')
#stock_dataset
#dataset = Dataset.Tabular.from_delimited_files(stock_dataset)
stock_dataset.to_pandas_dataframe().describe()
stock_dataset.take(3).to_pandas_dataframe()

X = stock_dataset.drop_columns(columns=['ActionTaken'])
y = stock_dataset.keep_columns(columns=['ActionTaken'], validate=True)
print(y)
#print('X and y are ready!')
stock_dataset_df = stock_dataset.to_pandas_dataframe()
y_df = stock_dataset_df['ActionTaken'].values
x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)

y_predict = fitted_model.predict(x_df)
print(y_predict)
        deploy_model = True
        print('Current model performs better and will be deployed!')
    else:
        print('Current model does NOT perform better and thus will NOT be deployed!')

eval_info = {}
eval_info["model_acc"] = latest_model_accuracy
eval_info["deployed_model_acc"] = current_model_accuracy
eval_info["deploy_model"] = deploy_model
eval_info['train_run_id'] = latest_model_run_id
eval_info['eval_run_id'] = run.id

if deploy_model:
    os.chdir(args.input)
    cardata_ds_name = 'connected_car_components'
    cardata_ds  = Dataset.get_by_name(workspace=ws, name=cardata_ds_name)
    glove_ds_name = 'glove_6B_100d'
    glove_ds = Dataset.get_by_name(workspace=ws, name=glove_ds_name)
    
    model_description = 'Deep learning model to classify the descriptions of car components as compliant or non-compliant.'
    
    # Create model datasheet
    from datetime import datetime
    from pytz import timezone
    etz = 'US/Eastern'
    time_stamp = datetime.now(timezone(etz))
    time_stamp_str = time_stamp.strftime('%A %m/%d/%Y %I:%M:%S%p')
    model_tags = {}
    model_tags['title'] = 'Connected car components classifier'
    model_tags['datasheet_description'] = 'Data sheet last updated: ' + time_stamp_str
    model_tags['details'] = 'This model was developed for automatically classifying car components as compliant or not compliant. The model leverages deep learning technologies with Natural Language Processing techniques to scan through vehicle specification documents to find compliance issues with new regulations.'
Example #21
def main(
    workspace=None,
    dataset_trainandvalidate_name=config.get_default_dataset_name(
        "trainandvalidate"),
):
    """
    Return AutoMLConfig
    """

    if not workspace:
        workspace = package_utils.get_workspace()

    args = aml_compute.parse_args()
    cluster_max_nodes = 5
    args.cluster_max_nodes = cluster_max_nodes
    args.cluster_sku = "Standard_D12_v2"
    compute_target = aml_compute.main(args)
    logger.info(msg="main",
                extra={"compute_target": compute_target.serialize()})

    trainandvalidate = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_trainandvalidate_name,
    )

    model_settings = {
        "task": "classification",
        "primary_metric": "norm_macro_recall",
    }

    ensemble_settings = {
        "iterations": 15,
        "allowed_models": ["LightGBM", "LogisticRegression", "SGD", "XGBoostClassifier"],
        "enable_voting_ensemble": True,
        "enable_stack_ensemble": False,
    }

    dataset_settings = {
        "validation_size": 0.3,
        "featurization": "auto",
        "training_data": trainandvalidate,
        "label_column_name": "Label",
    }

    compute_settings = {
        "compute_target": compute_target,
        "max_cores_per_iteration": -1,
        "max_concurrent_iterations": cluster_max_nodes,
        "experiment_timeout_hours": 1.5,
    }

    automl_config = AutoMLConfig(
        **model_settings,
        **ensemble_settings,
        **dataset_settings,
        **compute_settings,
    )

    return automl_config
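
A hedged usage sketch for the function above, submitting the returned configuration to an experiment (the experiment name is a placeholder):

# Hypothetical driver for main(); 'automl-classification' is a placeholder experiment name.
from azureml.core import Experiment, Workspace

ws = Workspace.from_config()  # or package_utils.get_workspace(), as main() does
automl_config = main(workspace=ws)
experiment = Experiment(ws, 'automl-classification')
remote_run = experiment.submit(automl_config, show_output=True)
best_run, fitted_model = remote_run.get_output()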
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df['ActionTaken'].values
    x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
    print(y_df)
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=os.getcwd(),
            #compute_target = 'Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )

        experiment = Experiment(ws, 'automl_local_v2')
        remote_run = experiment.submit(automl_config, show_output=True)
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata_toJson = rundata.to_json(orient='columns')

        return rundata_toJson
    except Exception:

        return 'error'
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df[target_var].values
    x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y_df)
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']
    best_model = request.json['best_model']

    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=
            'D:\\Stock_Prediction\\AutoML_Azure\\python\\Flask_API_Azure\\log',
            #compute_target = 'Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )

        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = remote_run.get_output()
        #print(best_run)
        print(best_run.get_file_names())
        #Register the model
        from datetime import date
        model = best_run.register_model(model_name=best_model +
                                        str(date.today()),
                                        model_path='outputs/model.pkl')
        print(model.name, model.id, model.version, sep='\t')
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata.rename(columns={
            0: "one",
            1: "two",
            2: "three",
            3: "four",
            4: "five",
            5: "six",
            6: "seven",
            7: "eight",
            8: "nine",
            9: "ten",
        },
                       inplace=True)
        rundata_toJson = rundata.to_json(orient='columns')
        print(rundata_toJson)
        return rundata_toJson
    except Exception:

        return 'error'
def Prediction():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    location = request.json['location']
    file_name = request.json['file_name']
    target_var = request.json['target_var']
    best_model = request.json['best_model']
    Model_path = request.json['Model_path']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    X = df.drop_columns(columns=[target_var])
    y = df.keep_columns(columns=[target_var], validate=True)
    y_df = stock_dataset_df[target_var].values
    x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y)

    #from azureml.core import Run
    #experiment=Experiment(ws, workspace_name)
    #from azureml.core.model import Model
    #model = Model(ws, name=Model_path)
    #model.download(exist_ok=True)
    import joblib
    cwd = r'D:\DCSAIAUTOML\BestModels\Azure'
    model_path = os.path.join(cwd, Model_path, best_model, "outputs")
    #model_path1 = os.path.join(model_path, "outputs", "model.pkl")
    print(model_path)
    os.chdir(model_path)
    model = joblib.load('model.pkl')
    #best_run = Run(experiment=experiment, run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8')
    #fitted_model = Run(experiment=experiment, run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8')
    print(model)
    try:
        y_predict = model.predict(x_df)
        print(y_predict)
        #prediction_toJson = y_predict.to_json(orient='columns')
        #print(prediction_toJson)
        df = pd.DataFrame(y_predict)
        df.rename(columns={0: "Prediction"}, inplace=True)
        #stock_df = stock_dataset_df[['SepalLengthCm','SepalWidthCm','Species']]
        result = pd.concat([stock_dataset_df, df], axis=1)
        result.to_csv(
            'D:\\PredictionResult\\Azure\\prediction_azure_health.csv',
            index=False,
            date_format='%Y%m%d')
        result.head()
        prediction_toJson = result.to_json(orient='records')
        return prediction_toJson

    except Exception as e:
        error_statement = str(e)
        print("Error statement: ", error_statement)
        return error_statement
Example #25
def list_datasets_in_workspace(workspace=None, logger=None):
    registrations = Dataset.get_all(workspace).registrations
    return [_registration_to_printable(r) for r in registrations]
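
A hypothetical call of the helper above, printing every dataset registration in a workspace (assumes a config.json for Workspace.from_config()):

# Hypothetical usage of list_datasets_in_workspace.
import logging
from azureml.core import Workspace

ws = Workspace.from_config()
for registration in list_datasets_in_workspace(workspace=ws,
                                               logger=logging.getLogger(__name__)):
    print(registration)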
def RunAutoMLReg():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']
    cluster_name = request.json['cluster_name']
    best_model = request.json['best_model']
    #best_model = request.json['best_model']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')
    #compute_target = AmlCompute(ws, cluster_name)
    compute_target = ws.compute_targets[cluster_name]
    print('Found existing AML compute context.')
    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    #stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    #stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    X = df.drop_columns(columns=[target_var])
    y = df.keep_columns(columns=[target_var], validate=True)
    #y_df = stock_dataset_df[target_var].values
    #x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y)
    # create a new RunConfig object
    conda_run_config = RunConfiguration(framework="python")
    conda_run_config.environment.docker.enabled = True
    conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
    cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                                  conda_packages=['numpy', 'py-xgboost<=0.90'])
    conda_run_config.environment.python.conda_dependencies = cd
    print('run config is ready')
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "featurization": 'auto',
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations
            #"verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            blacklist_models=['XGBoost'],
            #path=os.getcwd(),
            compute_target=compute_target,
            #run_configuration=conda_run_config,
            X=X,
            y=y,
            **automl_settings,
        )

        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        remote_run.flush(timeout_seconds=400)
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata = rundata.drop([
            'mean_absolute_percentage_error',
            'normalized_median_absolute_error',
            'normalized_root_mean_squared_log_error',
            'root_mean_squared_log_error'
        ])
        rundata.rename(columns={
            0: "one",
            1: "two",
            2: "three",
            3: "four",
            4: "five",
            5: "six",
            6: "seven",
            7: "eight",
            8: "nine",
            9: "ten",
        },
                       inplace=True)
        iterations_toJson = rundata.to_json(orient='columns')
        print(iterations_toJson)
        best_run, fitted_model = remote_run.get_output()
        best_run_toJson = best_run.get_metrics()
        cwd = 'D:/DCSAIAUTOML/BestModels/Azure'
        best_model_name = best_run.name
        model = remote_run.register_model(description=best_model)
        print(model.name, model.id, model.version, sep='\t')
        model_path = os.path.join(cwd, best_model, best_model_name)
        print(model_path)
        #print("Model DownLoad Complete")
        #model = Model(workspace=ws, name=model.name)
        #model.download_files(target_dir=model_path)
        #dict = {}
        #dict['iterations_toJson'] = iterations_toJson
        #dict['best_run_toJson'] = best_run_toJson
        #print(best_run.get_file_names())
        #Register the model
        #from datetime import date

        best_model_id = best_run.name

        var1 = "@"
        var2 = var1 + best_model_id

        Reg_model_name = model.name
        var4 = var1 + Reg_model_name

        best_run.flush(timeout_seconds=3600)
        best_run.download_files(output_directory=model_path)
        # importing required modules
        #import shutil
        #output_path = os.path.join(model_path, best_model_id)
        #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best"
        #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best\\my_azure_best"
        #shutil.make_archive(model_path,'zip',model_path)

        #zipf = zipfile.ZipFile(best_model_id+'.zip', 'w', zipfile.ZIP_DEFLATED)
        #for root, dirs, files in os.walk(model_path):
        #for file in files:
        #zipf.write(os.path.join(root, file))

        #def zipdir(path, ziph):
        # ziph is zipfile handle
        #import os
        #for root, dirs, files in os.walk(path):
        #for file in files:
        #ziph.write(os.path.join(root, file))

        #zipdir(model_path, zipf)
        #remote_run.clean_preprocessor_cache()
        print("ready to return")
        var5 = "no exception"
        return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1,
                                       var5)
        #return iterations_toJson
    except Exception as e:
        error_statement = str(e)
        print("Error statement: ", error_statement)
        model_path1 = os.path.join(model_path, 'outputs')
        file_name = 'model.pkl'
        print("in exception: ", model_path1)
        src = 'D:\\Final Script_dev'
        full_file_name = os.path.join(src, file_name)
        import shutil
        #remote_run.download_file('model.pkl', output_file_path=model_path1)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, model_path1)
        return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1,
                                       error_statement)
def RunAutoMLForecast():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']
    cluster_name = request.json['cluster_name']
    best_model = request.json['best_model']
    time_column_name = request.json['time_column_name']
    max_horizon = request.json['max_horizon']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')
    compute_target = AmlCompute(ws, cluster_name)
    print('Found existing AML compute context.')
    dataset_name = file_name
    time_column_name = time_column_name
    # Get a dataset by name
    dataset = Dataset.get_by_name(workspace=ws,
                                  name=dataset_name).with_timestamp_columns(
                                      fine_grain_timestamp=time_column_name)
    print(dataset)
    #df_ts = Dataset.Tabular.from_delimited_files(df_ts)
    dataset.to_pandas_dataframe().describe()
    dataset.take(3).to_pandas_dataframe()
    print(dataset)
    #y_df = df_ts[target_var].values
    #x_df = df_ts.drop([target_var], axis=1)
    print('file successfully received.')
    #stock_dataset_df.head()
    # create a new RunConfig object
    conda_run_config = RunConfiguration(framework="python")
    conda_run_config.environment.docker.enabled = True
    conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
    cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                                  conda_packages=['numpy', 'py-xgboost<=0.80'])
    conda_run_config.environment.python.conda_dependencies = cd
    print('run config is ready')
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    #max_concurrent_iterations = request.json['max_concurrent_iterations']

    automl_settings = {
        'time_column_name': time_column_name,
        'max_horizon': max_horizon,
        "iterations": iterations,
    }

    automl_config = AutoMLConfig(
        task=tasks,
        primary_metric=primary_metric,
        #blacklist_models = ['ExtremeRandomTrees', 'AutoArima', 'Prophet'],
        experiment_timeout_minutes=iteration_timeout_minutes,
        training_data=dataset,
        label_column_name=target_var,
        compute_target=compute_target,
        enable_early_stopping=True,
        n_cross_validations=n_cross_validations,
        #verbosity=logging.INFO,
        **automl_settings)
    print("AutoML config created.")
    experiment = Experiment(ws, ExperimentName)
    remote_run = experiment.submit(automl_config, show_output=True)
    children = list(remote_run.get_children())
    metricslist = {}
    for run in children:
        properties = run.get_properties()
        metrics = {
            k: v
            for k, v in run.get_metrics().items() if isinstance(v, float)
        }
        metricslist[int(properties['iteration'])] = metrics

    rundata = pd.DataFrame(metricslist).sort_index(axis=1)
    rundata.rename(columns={
        0: "one",
        1: "two",
        2: "three",
        3: "four",
        4: "five",
        5: "six",
        6: "seven",
        7: "eight",
        8: "nine",
        9: "ten",
    },
                   inplace=True)
    iterations_toJson = rundata.to_json(orient='columns')
    print(iterations_toJson)
    best_run, fitted_model = remote_run.get_output()
    #best_run_toJson = best_run.get_metrics()
    #dict = {}
    #dict['iterations_toJson'] = iterations_toJson
    #dict['best_run_toJson'] = best_run_toJson
    #print(best_run.get_file_names())
    #Register the model
    #from datetime import date
    model = remote_run.register_model(model_name=best_model,
                                      description='AutoML Model')
    print(model.name, model.id, model.version, sep='\t')
    best_model = model.name
    best_model
    var1 = "@"
    var2 = var1 + best_model
    return '{} {}'.format(iterations_toJson, var2)
        return sum(edge_lengths)
    else:
        raise ValueError('Path not found')


GRAPH_FILE_PATH = "https://grab5033896937.blob.core.windows.net/azureml/Dataset/grab/singapore.graphml"

try:
    # load workspace configuration from the config.json file in the current folder.
    #ws = Workspace.from_config()

    ws = Workspace.get(name="<<Insert Name>>",
                       subscription_id="<<Insert Subscription Id>>",
                       resource_group="<<Insert Resource Group>>")

    dataset = Dataset.get_by_name(ws, 'sg_graphml')

    # list the files referenced by sg_graphml dataset
    GRAPH_FILE_PATH = dataset.to_path()

    G = ox.load_graphml(GRAPH_FILE_PATH)
except Exception:
    G = ox.graph_from_place('Singapore', network_type='drive')
    ox.save_graphml(G, filepath=GRAPH_FILE_PATH)


def init():
    global model
    # Get the path where the deployed model can be found.
    model_path = Model.get_model_path('grab-model-reg')
    model = joblib.load(model_path)
Example #29
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
import pandas as pd
from azureml.core.run import Run
from azureml.core.dataset import Dataset
from azureml.core import Experiment, Workspace
from azureml.data.dataset_factory import TabularDatasetFactory

# Load dataset into data variable:
ws = Workspace.get("quick-starts-ws-126078")
ds = Dataset.get_by_name(ws, name='Heart-Failure')
data = ds.to_pandas_dataframe()

# Split target and features into y and x respectively in the clean_data function


def clean_data(data):

    y = data['DEATH_EVENT']
    x = data.drop(['DEATH_EVENT'], axis=1)
    return x, y


x, y = clean_data(data)

# Split data into train and test sets.
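
A hedged continuation of the fragment above, using the imports already at the top of this example; the split ratio and max_iter value are placeholder choices:

# Hedged continuation; test_size, random_state and max_iter are placeholder choices.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
print('Test accuracy: {:.3f}'.format(model.score(x_test, y_test)))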
def main():

    run = Run.get_context()
    ws = run.experiment.workspace

    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument("--learning_rate",
                        type=float,
                        default=0.3,
                        help="Boosting learning rate (xgb's 'eta')")
    parser.add_argument("--n_estimators",
                        type=int,
                        default=100,
                        help="Number of boosting rounds")
    parser.add_argument("--max_depth",
                        type=int,
                        default=6,
                        help="Maximum tree depth for base learners")
    parser.add_argument(
        "--min_child_weight",
        type=int,
        default=1,
        help="Minimum sum of instance weight(hessian) needed in a child")
    parser.add_argument(
        "--gamma",
        type=float,
        default=0,
        help=
        "Minimum loss reduction required to make a further partition on a leaf node of the tree"
    )
    parser.add_argument("--subsample",
                        type=float,
                        default=1.0,
                        help="Subsample ratio of the training instance")
    parser.add_argument(
        "--colsample_bytree",
        type=float,
        default=1.0,
        help="Subsample ratio of columns when constructing each tree")
    parser.add_argument("--reg_lambda",
                        type=float,
                        default=1.0,
                        help="L1 regularization term on weights")
    parser.add_argument("--reg_alpha",
                        type=float,
                        default=0,
                        help="L2 regularization term on weights")

    args = parser.parse_args()

    run.log("learning_rate:", np.float(args.learning_rate))
    run.log("n_estimators:", int(args.n_estimators))
    run.log("max_depth:", int(args.max_depth))
    run.log("min_child_weight:", int(args.min_child_weight))
    run.log("gamma:", np.float(args.gamma))
    run.log("subsample:", np.float(args.subsample))
    run.log("colsample_bytree:", np.float(args.colsample_bytree))
    run.log("reg_lambda:", np.float(args.reg_lambda))
    run.log("reg_alpha:", np.float(args.reg_alpha))

    dataset = Dataset.get_by_name(ws, name="attrition_train")
    df = dataset.to_pandas_dataframe()

    X_train, X_val, y_train, y_val = data_prep(df)

    clf = XGBClassifier(learning_rate=args.learning_rate,
                        n_estimators=args.n_estimators,
                        max_depth=args.max_depth,
                        min_child_weight=args.min_child_weight,
                        gamma=args.gamma,
                        subsample=args.subsample,
                        colsample_bytree=args.colsample_bytree,
                        reg_lambda=args.reg_lambda,
                        reg_alpha=args.reg_alpha)

    clf.fit(X_train, y_train)

    accuracy = np.round(clf.score(X_val, y_val), 3)
    run.log("accuracy", np.float(accuracy))

    auc_weighted = np.round(
        roc_auc_score(y_val, clf.predict(X_val), average='weighted'), 3)
    run.log("AUC_weighted", np.float(auc_weighted))

    os.makedirs("outputs", exist_ok=True)
    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(clf, "outputs/hyperdrive_model.pkl")