Beispiel #1
0
    def is_stale(self):
        """Return a boolean describing whether the computed profile is stale.

        A profile is considered stale if the underlying data changed after
        the profile was computed.
        - if the service cannot determine staleness (``is_up_to_date`` comes
          back as None), an ``AzureMLException`` is raised carrying the
          service-provided error;
        - if the data source was changed after submitting the profile run,
          the flag will be True;
        - otherwise, the profile matches the current data and the flag is
          False.

        :return: boolean describing whether the computed profile is stale.
        :rtype: bool
        """
        from azureml.core import Dataset
        dataset = Dataset.get_by_id(self._workspace, id=self._saved_dataset_id)
        workspace = dataset._ensure_workspace(self._workspace)

        # Ask the dataset service for the stored result of the profile action
        # so it can be compared against the current state of the data source.
        request_dto = ActionRequestDto(
            action_type=_ACTION_TYPE_PROFILE,
            saved_dataset_id=dataset._ensure_saved(workspace),
            arguments={'generate_preview': 'True', 'row_count': '1000'})

        action_result_dto = _restclient(workspace).dataset.get_action_result(
            workspace.subscription_id,
            workspace.resource_group,
            workspace.name,
            dataset_id=_LEGACY_DATASET_ID,
            request=request_dto,
            custom_headers=_custom_headers)

        # None means the service could not determine staleness at all.
        if action_result_dto.is_up_to_date is None:
            raise AzureMLException(action_result_dto.is_up_to_date_error)

        return not action_result_dto.is_up_to_date
Beispiel #2
0
def main():
    """Train a RandomForest classifier on a registered dataset and log metrics.

    Reads hyperparameters and the input-dataset id from the command line,
    trains the model, logs the AUC to the run, and saves the model under
    ./outputs (which is automatically uploaded to the run record).
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--n_estimators', type=int, default=100,
                        help="Number of trees in the forest.")
    parser.add_argument('--max_depth', type=int, default=None,
                        help="Maximum depth of tree.")
    parser.add_argument('--input_data', type=str)

    args = parser.parse_args()

    dataset = Dataset.get_by_id(ws, id=args.input_data)

    # Drop NAs and encode data.
    x, y = process_data(dataset)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

    # np.int/np.float were removed in NumPy 1.24; builtins are equivalent.
    run.log("Number of Estimators:", int(args.n_estimators))
    # max_depth defaults to None (unlimited); int(None) would raise TypeError,
    # so only coerce when a value was supplied. Label fixed: this is the max
    # depth, not a number of iterations.
    run.log("Max depth:",
            args.max_depth if args.max_depth is None else int(args.max_depth))

    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth).fit(x_train, y_train)

    # roc_auc_score expects (y_true, y_score) in that order; the original
    # call had the arguments reversed.
    roc_auc = roc_auc_score(y_test, model.predict(x_test))
    run.log("auc", float(roc_auc))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=model, filename='outputs/model.pkl')
Beispiel #3
0
def update_output_lineage(workspace, output_datasets):
    """Attach resolved Dataset objects to each output-dataset record.

    Each entry in ``output_datasets`` carries a saved-dataset id under
    ``['identifier']['savedId']``; the resolved Dataset is stored back under
    the ``'dataset'`` key. No-op when the collection is empty or None.
    """
    if output_datasets:
        for record in output_datasets:
            saved_id = record['identifier']['savedId']
            record['dataset'] = Dataset.get_by_id(workspace, saved_id)
def find_set(set_id):
    """Look up a dataset in the inputs by its ID (needed for Tabular Datasets).

    :param set_id:      ID of the dataset to load.
    :returns:           The resolved dataset.
    """
    workspace = Run.get_context().experiment.workspace
    return Dataset.get_by_id(workspace, set_id)
def main():
    """Train a GradientBoostingRegressor on the input dataset and log metrics.

    Loads the dataset by id from the run's workspace, trains the model, and
    logs MSE-derived metrics (normalized RMSE, r2_score) to the run. The
    fitted model is saved under ./outputs.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str)
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="The number of boosting stages to perform")
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.1,
        help=
        "Learning rate shrinks the contribution of each tree by learning_rate")

    args = parser.parse_args()

    run = Run.get_context()

    ws = run.experiment.workspace
    # get the input dataset by ID
    ds = Dataset.get_by_id(ws, id=args.input_data)

    df = ds.to_pandas_dataframe()
    erp_mean = df['ERP'].mean()
    x, y = split_train_label_data(df)

    # Split data into train and test sets: 20% of the dataset goes to the
    # test split.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # np.int/np.float were removed in NumPy 1.24; builtins are equivalent.
    run.log("n_estimators:", int(args.n_estimators))
    run.log("learning_rate:", float(args.learning_rate))

    model = GradientBoostingRegressor(n_estimators=args.n_estimators,
                                      learning_rate=args.learning_rate,
                                      max_depth=1,
                                      random_state=0,
                                      loss='huber').fit(x_train, y_train)
    # MSE
    mse = mean_squared_error(y_test, model.predict(x_test))

    # normalized_root_mean_squared_error => to be comparable with Azure results
    metric = math.sqrt(mse) / erp_mean

    run.log("normalized_root_mean_squared_error", float(metric))
    # Metric reported is 'r2_score' => metric to optimize
    run.log("r2_score", float(model.score(x_test, y_test)))

    os.makedirs('outputs', exist_ok=True)
    # Save the model into run history
    joblib.dump(model, 'outputs/model.joblib')
Beispiel #6
0
def get_data(dataset_id):
    '''Fetch the dataset by id and return its non-empty text documents.'''
    context = Run.get_context()
    workspace = context.experiment.workspace  # pylint: disable=invalid-name
    dataset = Dataset.get_by_id(workspace, id=dataset_id)
    frame = dataset.to_pandas_dataframe()  # pylint: disable=invalid-name
    # Column index 2 holds the raw text content; drop falsy (empty) entries.
    documents = frame.iloc[:, 2].values
    return [doc for doc in documents if doc]
Beispiel #7
0
def main():
    """Train a RandomForest classifier and log accuracy + confusion matrix.

    Loads the dataset by id from the run's workspace, trains the model on the
    'drugs_related_stop' target, logs metrics, and saves the model under
    ./outputs.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--max-depth', type=int, default=5)
    parser.add_argument("--input-data", type=str)

    args = parser.parse_args()

    run = Run.get_context()

    # np.int was removed in NumPy 1.24; the builtin is equivalent here.
    run.log("Num estimators:", int(args.n_estimators))
    run.log("Max depth:", int(args.max_depth))

    ws = run.experiment.workspace
    # get the input dataset by ID
    dataset = Dataset.get_by_id(ws, id=args.input_data)
    # load the TabularDataset to pandas DataFrame
    df = dataset.to_pandas_dataframe()

    X = df.drop(columns=['drugs_related_stop'])
    y = df['drugs_related_stop']

    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    model = RandomForestClassifier(n_estimators=args.n_estimators,
                                   max_depth=args.max_depth).fit(
                                       x_train, y_train)

    accuracy = model.score(x_test, y_test)
    # np.float was removed in NumPy 1.24; the builtin is equivalent here.
    run.log("accuracy", float(accuracy))
    value = {
        "schema_type": "confusion_matrix",
        "schema_version": "v1",
        "data": {
            "class_labels": ["0", "1"],
            "matrix": confusion_matrix(y_test, model.predict(x_test)).tolist()
        }
    }
    run.log_confusion_matrix(name='Confusion Matrix', value=value)
    os.makedirs('outputs', exist_ok=True)
    # note: files saved in the outputs folder are automatically uploaded into
    # the experiment record
    joblib.dump(value=model, filename='outputs/model.pkl')
def register_aml_model(
    model_path,
    model_name,
    model_tags,
    exp,
    run_id,
    dataset_id,
    build_id: str = 'none',
    build_uri=None
):
    """Register a model with the AML workspace, tagging it with run/build info.

    When ``build_id`` is set, checks for an existing registration and adds
    BuildId/BuildUri tags. Prints the traceback and re-raises on failure.
    """
    try:
        tags = {
            "area": "aml_recommender",
            "run_id": run_id,
            "experiment_name": exp.name,
        }
        tags.update(model_tags)
        if build_id != 'none':
            model_already_registered(model_name, exp, run_id)
            tags["BuildId"] = build_id
            if build_uri is not None:
                tags["BuildUri"] = build_uri

        training_data = ('training data',
                         Dataset.get_by_id(exp.workspace, dataset_id))
        model = AMLModel.register(
            workspace=exp.workspace,
            model_name=model_name,
            model_path=model_path,
            tags=tags,
            datasets=[training_data])
        os.chdir("..")
        print("Model registered: {} \nModel Description: {} "
              "\nModel Version: {}".format(model.name, model.description,
                                           model.version))
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print("Model registration failed")
        raise
Beispiel #9
0
def register_aml_model(run_id,
                       exp,
                       model_tags,
                       model_name,
                       model_path,
                       dataset_id,
                       build_id=None,
                       build_uri=None):
    """Register a model in the workspace with run/build tags attached.

    When ``build_id`` is given, checks for an existing registration and adds
    BuildId/BuildUri tags. Prints the traceback and re-raises on failure.
    """
    try:
        tags = {
            'area': 'diabetes_regression',
            'run_id': run_id,
            'experiment_name': exp.name,
        }
        tags.update(model_tags)
        if build_id is not None:
            model_already_registered(model_name, run_id, exp)
            tags['BuildId'] = build_id
            if build_uri is not None:
                tags['BuildUri'] = build_uri

        training_data = ('training_data',
                         Dataset.get_by_id(exp.workspace, dataset_id))
        model = Model.register(workspace=exp.workspace,
                               model_path=model_path,
                               tags=tags,
                               model_name=model_name,
                               datasets=[training_data])
        print(
            f'{model_name} has been registered,\nmodel description: {model.description},\nmodel version: {model.version}'
        )
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print('model registration failed!')
        raise
Beispiel #10
0
def main():
    """Train an XGBoost classifier on the input dataset and log accuracy.

    Loads the registered training dataset by id, cleans it, trains the
    classifier, saves the model under ./outputs, and logs accuracy to the run.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_data",
                        type=str,
                        help="Id of the registered train dataset")
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="Number of estimators")
    parser.add_argument('--max_depth',
                        type=int,
                        default=6,
                        help="Maximum depth of the trees")

    args = parser.parse_args()

    # np.int/np.float were removed in NumPy 1.24. n_estimators is an int, so
    # log it as one rather than coercing to float.
    run.log("Number of estimators:", int(args.n_estimators))
    run.log("Max depth:", int(args.max_depth))

    # Create TabularDataset
    dataset = Dataset.get_by_id(ws, id=args.input_data)

    X_train, X_test, y_train, y_test = clean_data(dataset)

    model = XGBClassifier(n_estimators=args.n_estimators,
                          max_depth=args.max_depth).fit(X_train, y_train)

    # Saving the model. Use a context manager so the file handle is not
    # leaked (the original passed a bare open() into pickle.dump).
    os.makedirs("outputs", exist_ok=True)
    filename = 'outputs/model.pkl'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    run.log("Accuracy", float(accuracy))
# --- Scoring fragment: load a registered model and score a test dataset ---
# NOTE(review): `parser`, `args.input_data` and several imports are defined
# outside this excerpt; it is not runnable on its own.
args = parser.parse_args()
target_column_name = args.target_column_name
model_name = args.model_name

print("args passed are: ")
print("Target column name: ", target_column_name)
print("Name of registered model: ", model_name)

model_path = Model.get_model_path(model_name)
# deserialize the model file back into a sklearn model
model = joblib.load(model_path)

run = Run.get_context()

# Resolve the test dataset from its saved-dataset id.
test_dataset = Dataset.get_by_id(run.experiment.workspace, id=args.input_data)

# Features: every column except the target.
X_test_df = test_dataset.drop_columns(
    columns=[target_column_name]).to_pandas_dataframe()
# Labels: only the target column, with timestamp columns cleared first.
y_test_df = (test_dataset.with_timestamp_columns(None).keep_columns(
    columns=[target_column_name]).to_pandas_dataframe())

predicted = model.predict_proba(X_test_df)

if isinstance(predicted, pd.DataFrame):
    predicted = predicted.values

# Use the AutoML scoring module
# Class labels = union of labels in the test set and labels the model was
# trained on, so scoring handles classes missing from either side.
train_labels = model.classes_
class_labels = np.unique(
    np.concatenate((y_test_df.values, np.reshape(train_labels, (-1, 1)))))
def parse_id_to_dataset(dataset_id):
    """Resolve a saved-dataset id to a Dataset via the current run's workspace."""
    workspace = Run.get_context().experiment.workspace
    return Dataset.get_by_id(workspace, id=dataset_id)
Beispiel #13
0
# --- Fragment: ScriptRunConfig passing a dataset as a script argument ---
# NOTE(review): `env`, `packages`, `tab_ds`, `parser`, `ScriptRunConfig`,
# `Environment` and `CondaDependencies` come from outside this excerpt.
env.python.conda_dependencies = packages

# Pass the TabularDataset directly; it is serialized to its dataset id on the
# command line.
script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', tab_ds],
                                environment=env)

# Script executed by the run: resolve the id back into a Dataset.
from azureml.core import Run, Dataset

parser.add_argument('--ds', type=str, dest='dataset_id')
args = parser.parse_args()

run = Run.get_context()
ws = run.experiment.workspace
dataset = Dataset.get_by_id(ws, id=args.dataset_id)
data = dataset.to_pandas_dataframe()



# Alternative: use a named input instead of a plain id argument.
env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
                                    pip_packages=['azureml-defaults',
                                                  'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages

script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', tab_ds.as_named_input('my_dataset')],
                                environment=env)
Beispiel #14
0
# --- Example: read a TabularDataset passed by id into pandas ---
import argparse
from azureml.core import Dataset, Run

parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str)
args = parser.parse_args()

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by ID
dataset = Dataset.get_by_id(ws, id=args.input_data)

# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
# --- Fragment: taxi-data cleansing step ---
# NOTE(review): this fragment reuses `parser` from above but relies on
# arguments (--raw_data, --output_cleanse) and a helper (get_dict) defined
# outside this excerpt.
parser.add_argument("--useful_columns",
                    type=str,
                    help="useful columns to keep")
parser.add_argument("--columns", type=str, help="rename column pattern")

args = parser.parse_args()

print("Argument 1(raw data id): %s" % args.raw_data)
print("Argument 2(columns to keep): %s" %
      str(args.useful_columns.strip("[]").split(";")))
print("Argument 3(columns renaming mapping): %s" %
      str(args.columns.strip("{}").split(";")))
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)

run = Run.get_context()
raw_data = Dataset.get_by_id(run.experiment.workspace, id=args.raw_data)

# These functions ensure that null data is removed from the dataset,
# which will help increase machine learning model accuracy.

# Parse a "[a; b; c]"-style list into clean column names.
useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(";")
]
columns = get_dict(args.columns)

# Drop fully-empty rows, rename per mapping, keep only the useful columns.
new_df = (raw_data.to_pandas_dataframe().dropna(how='all').rename(
    columns=columns))[useful_columns]

new_df.reset_index(inplace=True, drop=True)

# NOTE(review): the excerpt is corrupted below — the body of this `if` was
# replaced by stray lines from an unrelated argparse call, so this section
# does not parse as-is and needs the original source restored.
if not (args.output_cleanse is None):
    dest="target_column_name",
    help="Target Column Name",
)
parser.add_argument(
    "--test_dataset", type=str, dest="test_dataset", help="Test Dataset"
)

args = parser.parse_args()
target_column_name = args.target_column_name
test_dataset_id = args.test_dataset

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by id
test_dataset = Dataset.get_by_id(ws, id=test_dataset_id)

# Features: everything except the target column.
X_test = (
    test_dataset.drop_columns(columns=[target_column_name])
    .to_pandas_dataframe()
    .reset_index(drop=True)
)
# Labels: only the target column, timestamp columns cleared first.
y_test_df = (
    test_dataset.with_timestamp_columns(None)
    .keep_columns(columns=[target_column_name])
    .to_pandas_dataframe()
)

# generate forecast
fitted_model = joblib.load("model.pkl")
# We have default quantiles values set as below(95th percentile)
Beispiel #17
0
import sys

from azureml.core import Run, Dataset
from azureml.data._dataset_client import _DatasetClient

if __name__ == '__main__':
    # Positional args: legacy dataset id, the action to execute, and the
    # saved-dataset id used to look up the dataflow definition.
    dataset_id = sys.argv[1]
    action_id = sys.argv[2]
    saved_dataset_id = sys.argv[3]

    print('Start execution with action_id = {0}, dataset_id = {1} and '
          'saved_dataset_id = {2}'.format(action_id, dataset_id,
                                          saved_dataset_id))

    workspace = Run.get_context().experiment.workspace
    if workspace is None:
        raise TypeError('Workspace is found to be None')

    dataflow_json = None
    if saved_dataset_id:
        try:
            dataset = Dataset.get_by_id(workspace, saved_dataset_id)
        except Exception as e:
            error_msg = 'Failed to get the dataset details by saved dataset id {}'.format(
                saved_dataset_id)
            print(error_msg)
            # Chain the original failure so the root cause is not lost.
            raise TypeError(error_msg) from e
        # Fix: only dereference `dataset` when it was actually resolved.
        # Previously this line ran unconditionally and raised NameError
        # whenever saved_dataset_id was empty.
        dataflow_json = dataset._dataflow.to_json()

    _DatasetClient._execute_dataset_action(workspace, dataset_id, action_id,
                                           dataflow_json)
Beispiel #18
0
# --- Fragment: tweet-classification preprocessing (TF-IDF features) ---
# NOTE(review): `parser`, `Run`, `Dataset` and `TfidfVectorizer` are defined
# or imported outside this excerpt.
args = parser.parse_args()
min_df = args.min_df
max_df = args.max_df
ngram_min = args.ngram_min
ngram_max = args.ngram_max
n_estimators = args.n_estimators
max_depth = args.max_depth
min_samples_split = args.min_samples_split
min_class_frequency = args.min_class_frequency
input_data = args.input_data

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by ID
dataset = Dataset.get_by_id(ws, id=input_data)
run.log("Dataset Version", dataset.version)
data_ml = dataset.to_pandas_dataframe()

# Keep only target classes with at least `min_class_frequency` records.
data_ml = data_ml.groupby('target').filter(lambda x: len(x) >= min_class_frequency)
# drop nulls
data_ml = data_ml[["target", "tweet"]].dropna().copy()

# Character n-grams respecting word boundaries ('char_wb'), with document
# frequency cutoffs from the command line.
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(ngram_min, ngram_max), min_df=min_df, max_df=max_df)

X = vec.fit_transform(data_ml['tweet'])

y = data_ml["target"]
run.log("Number of classes", len(set(y)))
Beispiel #19
0
def main():
    """Train an XGBoost multi-class model with tunable hyperparameters.

    Parses hyperparameters from the command line, loads the train/test
    datasets by id from the workspace, trains with xgboost on sparse
    matrices, and logs weighted F1 and accuracy for both splits to the run.
    The fitted booster is saved under ./outputs.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser(description="hyperparameters of the logistic regression model")
    parser.add_argument('--test-set', type=str,
                        help="Name of your test set")
    parser.add_argument('--train-set', type=str,
                        help="Name of your training set")
    parser.add_argument('--max-depth', type=int, default=3,
                        help="How deep is the tree growing during one round of boosting")
    parser.add_argument('--min-child-weight', type=int,
                        default=2,
                        help="Minimum sum of weight for all observations in a child. Controls overfitting")
    parser.add_argument('--gamma', type=float,
                        default=0,
                        help="Gamma corresponds to the minimum loss reduction required to make a split.")
    parser.add_argument('--subsample', type=float,
                        default=0.9,
                        help="What fraction of samples are randomly sampled per tree.")
    parser.add_argument('--colsample-bytree', type=float,
                        default=0.8,
                        help="What fraction of feature columns are randomly sampled per tree.")
    parser.add_argument('--reg-alpha', type=float,
                        default=0.00001,
                        help="L1 regularization of the weights. Increasing the values more strongly prevents "
                             "overfitting.")
    parser.add_argument('--eta', type=float,
                        default=0.2,
                        help="Learning rate for XGBoost.")
    parser.add_argument('--seed', type=int,
                        default=42,
                        help="Random seed.")
    parser.add_argument('--num-iterations', type=int,
                        default=20,
                        help="Number of fitting iterations")

    args = parser.parse_args()

    params = {
        'eta': args.eta,
        'max_depth': args.max_depth,
        'min_child_weight': args.min_child_weight,
        'gamma': args.gamma,
        'subsample': args.subsample,
        'colsample_bytree': args.colsample_bytree,
        'reg_alpha': args.reg_alpha,
        'seed': args.seed,
        'objective': 'multi:softmax',
        'num_class': 3,
    }

    # np.int/np.float were removed in NumPy 1.24; builtins are equivalent.
    run.log("max depth:", int(args.max_depth))
    run.log("min_child_weight:", float(args.min_child_weight))
    run.log("gamma", float(args.gamma))
    run.log("subsample:", float(args.subsample))
    run.log("colsample_bytree:", float(args.colsample_bytree))
    run.log("reg alpha:", float(args.reg_alpha))
    run.log("learning rate:", float(args.eta))

    # Load the Training Dataset and the Test Dataset
    ws = run.experiment.workspace
    dataset_training = Dataset.get_by_id(ws, id=args.train_set)
    dataset_test = Dataset.get_by_id(ws, id=args.test_set)
    run.log("loaded_dataset", str(dataset_test))

    train_df = dataset_training.to_pandas_dataframe()
    test_df = dataset_test.to_pandas_dataframe()

    # Convert the training and test sets to sparse matrices, then create an
    # xgboost DMatrix for efficient computation.
    x_train = scipy.sparse.csr_matrix(train_df.drop(columns=['norm_rating']).to_numpy())
    y_train = list(train_df.norm_rating)
    x_test = scipy.sparse.csr_matrix(test_df.drop(columns=['norm_rating']).to_numpy())
    y_test = list(test_df.norm_rating)

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)

    # Train the model
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = args.num_iterations
    model = xgb.train(params, dtrain, num_round, watchlist)

    # Evaluate the model. 'multi:softmax' returns class indices directly, so
    # predictions are comparable to the labels.
    pred_test = model.predict(dtest)
    pred_train = model.predict(dtrain)

    # Compute F1 scores and Accuracy scores
    f1_score_weighted_train = f1_score(y_train, pred_train, average='weighted')
    accuracy_train = accuracy_score(y_train, pred_train)
    f1_score_weighted = f1_score(y_test, pred_test, average='weighted')
    accuracy = accuracy_score(y_test, pred_test)

    os.makedirs('outputs', exist_ok=True)
    # note: files saved in the outputs folder are automatically uploaded into
    # the experiment record
    joblib.dump(value=model, filename='outputs/xgboost_model.pkl')

    run.log("F1ScoreWeightedTrain", float(f1_score_weighted_train))
    run.log("F1ScoreWeighted", float(f1_score_weighted))
    run.log("AccuracyTrain", float(accuracy_train))
    run.log("Accuracy", float(accuracy))