def register_aml_model(
    model_path,
    model_name,
    model_tags,
    exp,
    run_id,
    dataset_id,
    build_id: str = 'none',
    build_uri=None
):
    try:
        tagsValue = {"area": "sales_forecast",
                     "run_id": run_id,
                     "experiment_name": exp.name}
        tagsValue.update(model_tags)
        if (build_id != 'none'):
            model_already_registered(model_name, exp, run_id)
            tagsValue["BuildId"] = build_id
            if (build_uri is not None):
                tagsValue["BuildUri"] = build_uri

        model = AMLModel.register(
            workspace=exp.workspace,
            model_name=model_name,
            model_path=model_path,
            tags=tagsValue,
            datasets=[('training data',
                       Dataset.get_by_id(exp.workspace, dataset_id))])
        os.chdir("..")
        print(
            "Model registered: {} \nModel Description: {} "
            "\nModel Version: {}".format(
                model.name, model.description, model.version
            )
        )
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print("Model registration failed")
        raise
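For orientation, a hedged usage sketch of the helper above; the dataset name, model path, tags, and build id are illustrative assumptions, not taken from the example itself.

from azureml.core import Dataset, Run

run = Run.get_context()
exp = run.experiment
# Hypothetical dataset name; any tabular dataset registered in the workspace works here.
training_ds = Dataset.get_by_name(exp.workspace, name="sales_data")

register_aml_model(
    model_path="outputs/model.pkl",        # hypothetical path written by the training step
    model_name="sales_forecast_model",     # hypothetical model name
    model_tags={"stage": "candidate"},
    exp=exp,
    run_id=run.id,
    dataset_id=training_ds.id,
    build_id="20240101.1",                 # hypothetical CI build id
)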
Example #2
def get_or_register_dataset(dataset_name: str,
                            datastore_name: str,
                            data_file_path: str,
                            aml_workspace: Workspace = None) -> Dataset:
    if dataset_name is None:
        raise Exception("Dataset name can't be null")

    if aml_workspace is None:
        print("No workspace defined - using current experiment workspace.")
        aml_workspace, *_ = get_aml_context(Run.get_context())

    if data_file_path == "nopath":
        print(f"get latest version of dataset: {dataset_name}")
        dataset = Dataset.get_by_name(aml_workspace, dataset_name)
    else:
        print(
            f"register a new dataset or new version: {dataset_name}, {datastore_name}, {data_file_path}"
        )  # NOQA: E501
        dataset = register_dataset(aml_workspace, dataset_name, datastore_name,
                                   data_file_path)

    return dataset
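For orientation, a hedged call sketch; the names below are assumptions, and passing "nopath" exercises the latest-version branch shown above.

dataset = get_or_register_dataset(
    dataset_name="sales_data",            # hypothetical registered dataset name
    datastore_name="workspaceblobstore",  # assumed default AzureML datastore name
    data_file_path="nopath",              # "nopath" -> fetch the latest registered version
)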
Example #3
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--data', type=str, help="Loading dataset")
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    # split data to train and test sets
    dataset = Dataset.get_by_name(ws, name='diabetes_data_set')
    dataset = dataset.to_pandas_dataframe()
    x = dataset.drop(columns=['Outcome'])
    y = dataset['Outcome']
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)

    os.makedirs('outputs', exist_ok=True)

    joblib.dump(model, 'outputs/model.joblib')

    run.log("Accuracy", np.float(accuracy))
Example #4
def main(experiment, environment, dataset):
    workspace = Workspace.from_config()
    experiment = Experiment(workspace, experiment)
    compute_target = ComputeTarget(workspace, environment)
    
    # Use the root of the solution as source folder for the run.
    root_folder = Path(__file__).parent.parent

    # Provide each of the datasets to the estimator as a named input.
    # You can access these from within the training script.
    datasets = [Dataset.get_by_name(workspace, ds).as_named_input(ds) for ds in dataset]

    estimator = SKLearn(
        source_directory=root_folder,
        entry_script='customer_churn/train.py',
        conda_dependencies_file='conda_dependencies.yml',
        compute_target=compute_target,
        inputs=datasets
    )

    run = experiment.submit(estimator)

    run.wait_for_completion(show_output=True)
Example #5
def register_model(model_name, dataset_name, build_id):
    # Retrieve dataset
    if run._run_id.startswith("OfflineRun"):
        workspace = Workspace.from_config()
    else:
        workspace = run.experiment.workspace

    # Retrieve train datasets
    train_dataset = [(dataset_name,
                      Dataset.get_by_name(workspace, name=dataset_name))]

    # Get evaluation metric for model
    run_metrics = run.parent.get_metrics()

    # Define model file name
    model_file_name = "model.pkl"

    # Define model tags
    model_tags = {
        "build_id": build_id,
        "test_acccuracy": run_metrics.get(evaluation_metric),
    }

    print("Variable [model_tags]:", model_tags)

    # Register the model
    model = run.parent.register_model(
        model_name=model_name,
        model_path=model_file_name,
        model_framework=Model.Framework.SCIKITLEARN,
        model_framework_version=sklearn.__version__,
        datasets=train_dataset,
        tags=model_tags,
    )

    print("Variable [model]:", model.serialize())
    logger.info(model.serialize())
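register_model relies on module-level `run`, `logger`, `sklearn`, `Model` and `evaluation_metric` that are not shown here; a hedged sketch of how they are typically defined (the metric key is an assumption):

import logging

import sklearn
from azureml.core import Run
from azureml.core.model import Model

logger = logging.getLogger(__name__)
run = Run.get_context()
evaluation_metric = "accuracy"   # hypothetical: key of the metric logged on the parent run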
Example #6
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_data",
                        type=str,
                        help="Id of the registered train dataset")
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="Number of estimators")
    parser.add_argument('--max_depth',
                        type=int,
                        default=6,
                        help="Maximum depth of the trees")

    args = parser.parse_args()

    run.log("Number of estimators:", np.float(args.n_estimators))
    run.log("Max depth:", np.int(args.max_depth))

    # Create TabularDataset

    dataset = Dataset.get_by_id(ws, id=args.input_data)

    X_train, X_test, y_train, y_test = clean_data(dataset)

    model = XGBClassifier(n_estimators=args.n_estimators,
                          max_depth=args.max_depth).fit(X_train, y_train)

    # Save the model
    os.makedirs("outputs", exist_ok=True)
    filename = 'outputs/model.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    run.log("Accuracy", np.float(accuracy))
Example #7
def register_aml_model(run_id,
                       exp,
                       model_tags,
                       model_name,
                       model_path,
                       dataset_id,
                       build_id=None,
                       build_uri=None):
    try:
        tags_value = {
            'area': 'diabetes_regression',
            'run_id': run_id,
            'experiment_name': exp.name
        }
        tags_value.update(model_tags)
        if build_id is not None:
            model_already_registered(model_name, run_id, exp)
            tags_value['BuildId'] = build_id
            if build_uri is not None:
                tags_value['BuildUri'] = build_uri

        model = Model.register(workspace=exp.workspace,
                               model_path=model_path,
                               tags=tags_value,
                               model_name=model_name,
                               datasets=[
                                   ('training_data',
                                    Dataset.get_by_id(exp.workspace,
                                                      dataset_id))
                               ])
        print(
            f'{model_name} has been registered,\nmodel description: {model.description},\nmodel version: {model.version}'
        )
    except Exception:
        traceback.print_exc(limit=None, file=None, chain=True)
        print('model registration failed!')
        raise
Example #8
    def scale_up(self, workers=1):
        """ Scale up the number of workers.
        """
        run_config = RunConfiguration()
        run_config.target = self.compute_target
        run_config.environment = self.environment_definition

        scheduler_ip = self.run.get_metrics()["scheduler"]
        args = [
            f"--scheduler_ip_port={scheduler_ip}",
            f"--use_gpu={self.use_gpu}",
            f"--n_gpus_per_node={self.n_gpus_per_node}",
            f"--worker_death_timeout={self.worker_death_timeout}",
        ]

        file_dataset_registered_name = self.kwargs.get(
            'file_dataset_registered_name', None)
        dataset_config_name = self.kwargs.get('dataset_config_name', None)
        path_on_compute = self.kwargs.get('path_on_compute', None)
        if path_on_compute is not None:
            dataset = Dataset.get_by_name(workspace=self.workspace,
                                          name=file_dataset_registered_name)
            input1 = dataset.as_named_input(dataset_config_name).as_mount(
                path_on_compute=path_on_compute)
            args.append(input1)

        child_run_config = ScriptRunConfig(
            source_directory=os.path.join(self.abs_path, "setup"),
            script="start_worker.py",
            arguments=args,
            run_config=run_config,
        )

        for i in range(workers):
            child_run = self.run.submit_child(child_run_config, tags=self.tags)
            self.workers_list.append(child_run)
            hostname = socket.gethostname()
Example #9
 def get_or_create_dataset(self, azure_dataset_id: str) -> FileDataset:
     """
     Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
     created and registered, assuming that the files are in a folder that has the same name as the dataset.
     For example, if azure_dataset_id is 'foo', then the 'foo' dataset should be pointing to the folder
     <container_root>/datasets/foo/
     """
     if not self.azureml_datastore:
         raise ValueError(
             "No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)"
         )
     if not azure_dataset_id:
         raise ValueError("No dataset ID provided.")
     logging.info(
         f"Retrieving datastore '{self.azureml_datastore}' from AzureML workspace"
     )
     workspace = self.get_workspace()
     datastore = Datastore.get(workspace, self.azureml_datastore)
     try:
         logging.info(
             f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
         azureml_dataset = Dataset.get_by_name(workspace,
                                               name=azure_dataset_id)
         logging.info("Dataset found.")
     except Exception:
         logging.info(
             f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'"
         )
         # Ensure that there is a / at the end of the file path, otherwise folders that share a prefix could create
         # trouble (for example, folders foo and foo_bar exist, and I'm trying to create a dataset from "foo")
         azureml_dataset = Dataset.File.from_files(path=(datastore,
                                                         azure_dataset_id +
                                                         "/"))
         logging.info("Registering the dataset for future use.")
         azureml_dataset.register(workspace, name=azure_dataset_id)
     return azureml_dataset
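A hedged sketch of consuming the returned FileDataset as a mounted input; `config` stands in for an instance of the surrounding class, and the input name, script, and paths are assumptions.

from azureml.core import ScriptRunConfig

file_dataset = config.get_or_create_dataset("foo")   # `config`: hypothetical instance of this class
training_input = file_dataset.as_named_input("training_files").as_mount()

script_config = ScriptRunConfig(
    source_directory=".",
    script="train.py",               # hypothetical entry script
    arguments=[training_input],
)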
Example #10
# Helper file to submit an experiment run
import os
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.model import Model
from azureml.train.estimator import Estimator
from azureml.core.authentication import AzureCliAuthentication
from azureml.data.data_reference import DataReference

# load Azure ML workspace
azureml_workspace = Workspace.from_config(auth=AzureCliAuthentication())

# Retrieve a pointer to the dataset versions
redditcomments_gaming = Dataset.get_by_name(azureml_workspace,
                                            name='redditcomments_gaming',
                                            version='latest')

redditcomments = Dataset.get_by_name(azureml_workspace,
                                     name='redditcomments',
                                     version='latest')

# Configure the training run
est = Estimator(entry_script='train.py',
                script_params={'--alpha': 1.0},
                source_directory=os.path.dirname(os.path.realpath(__file__)),
                compute_target='ml-e2e',
                inputs=[redditcomments_gaming.as_named_input('comments')],
                pip_packages=[
                    "azureml-sdk", "azureml-mlflow", "matplotlib", "scipy",
                    "sklearn", "azure-cli", "pandas", "numpy"
                ])
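The estimator `est` is configured above but the snippet stops before submission; a hedged continuation, reusing the Experiment import from this example (the experiment name is an assumption):

experiment = Experiment(azureml_workspace, "redditcomments-train")
run = experiment.submit(est)
run.wait_for_completion(show_output=True)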
Example #11
        df_filtered.shape[0], start_time, end_time))
    return df_filtered


print("Check for new data and prepare the data")

parser = argparse.ArgumentParser("split")
parser.add_argument("--ds_name", help="name of the Dataset to update")
args = parser.parse_args()

print("Argument 1(ds_name): %s" % args.ds_name)

dstor = ws.get_default_datastore()
register_dataset = False
try:
    ds = Dataset.get_by_name(ws, args.ds_name)
    end_time_last_slice = ds.data_changed_time.replace(tzinfo=None)
    print("Dataset {0} last updated on {1}".format(args.ds_name,
                                                   end_time_last_slice))
except Exception as e:
    print(traceback.format_exc())
    print("Dataset with name {0} not found, registering new dataset.".format(
        args.ds_name))
    register_dataset = True
    end_time_last_slice = datetime.today() - relativedelta(weeks=1)

end_time = datetime.utcnow()
train_df = get_noaa_data(end_time_last_slice, end_time)

if train_df.size > 0:
    print("Received {0} rows of new data after {0}.".format(
Example #12
from azureml.pipeline.core import Pipeline, PipelineData

from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.steps import ParallelRunConfig
from azureml.pipeline.steps import ParallelRunStep

print("SDK version:", azureml.core.VERSION)

dataset_name = 'grib-dataset'

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

datastore = ws.get_default_datastore()

input_ds = Dataset.get_by_name(ws, dataset_name)
batch_data = DatasetConsumptionConfig("batch_dataset", input_ds, mode='mount')

output_dir = PipelineData(name='batch_output', datastore=datastore)

parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path='convert_parallel.yml')

batch_step = ParallelRunStep(name="batch-conversion-step",
                             parallel_run_config=parallel_run_config,
                             arguments=['--data_output_path', output_dir],
                             inputs=[batch_data],
                             output=output_dir,
                             allow_reuse=False)

steps = [batch_step]
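The steps list is built but not used further in this snippet; a hedged sketch of the usual next step, wrapping it into a pipeline and submitting it (the experiment name is an assumption):

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace=ws, steps=steps)
pipeline_run = Experiment(ws, "grib-batch-conversion").submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)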
Example #13
def main():
    print("Running train.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )

    parser.add_argument(
        "--dataset_name",
        type=str,
        help=("Dataset with the training data")
    )
    args = parser.parse_args()

    print("Argument [build_id]: %s" % args.build_id)
    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    build_id = args.build_id
    dataset_name = args.dataset_name

    print("Getting training parameters")

    with open("config.json") as f:
        pars = json.load(f)
    try:
        alpha = pars["training"]["alpha"]
    except KeyError:
        alpha = 0.5

    print("Parameter alpha: %s" % alpha)

    run = Run.get_context()
    ws = run.experiment.workspace

    if (dataset_name):
        dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
        df = dataset.to_pandas_dataframe()
        # Drop the target column so it is not leaked into the features
        X = df.drop(columns=['Y']).values
        y = df.Y
    else:
        X, y = load_diabetes(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}

    reg = train_model(run, data, alpha)

    joblib.dump(value=reg, filename=model_name)

    # upload model file explicitly into artifacts for parent run
    run.parent.upload_file(name="./outputs/" + model_name,
                           path_or_stream=model_name)
    print("Uploaded the model {} to experiment {}".format(
        model_name, run.experiment.name))
    dirpath = os.getcwd()
    print(dirpath)
    print("Following files are uploaded ")
    print(run.parent.get_file_names())

    run.parent.tag("BuildId", value=build_id)

    # Add properties to identify this specific training run
    run.tag("BuildId", value=build_id)
    run.tag("run_type", value="train")
    builduri_base = os.environ.get("BUILDURI_BASE")
    if (builduri_base is not None):
        build_uri = builduri_base + build_id
        run.tag("BuildUri", value=build_uri)
        run.parent.tag("BuildUri", value=build_uri)
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #14
import argparse
from azureml.core import Dataset, Run

parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str)
args = parser.parse_args()

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by ID
dataset = Dataset.get_by_id(ws, id=args.input_data)

# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
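A hedged sketch of the submitting side that pairs with this script: passing a TabularDataset via as_named_input in the arguments list makes its ID available to --input-data. The dataset, script, experiment, and compute target names are assumptions.

from azureml.core import Dataset, Experiment, ScriptRunConfig, Workspace

ws = Workspace.from_config()
tabular_ds = Dataset.get_by_name(ws, name="my_tabular_dataset")   # hypothetical dataset name

src = ScriptRunConfig(
    source_directory=".",
    script="consume_dataset.py",                                  # hypothetical name for the script above
    arguments=["--input-data", tabular_ds.as_named_input("input_ds")],
    compute_target="cpu-cluster",                                 # hypothetical compute target name
)
Experiment(ws, "dataset-by-id-demo").submit(src)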
Example #15
def save_model(model, model_name, output_folder):
    output_path = output_folder + '/{}.pkl'.format(model_name)
    joblib.dump(value=model, filename=output_path)


def main(args, run, pd_dataset):
    logging.basicConfig(level=logging.INFO)
    pd_dataset = pd_dataset.to_pandas_dataframe()
    X, y = pd_dataset.iloc[:, :-1], pd_dataset.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state_test)
    model = train_decision_tree(args.max_leaf_nodes, args.random_state_model,
                                X_train, y_train)
    y_prediction = predict(model, X_test)
    model_accuracy = accuracy_score(y_true=y_test, y_pred=y_prediction)
    labels = y.unique()
    matrix = confusion_matrix(y_test, y_prediction, labels=labels)
    save_model(model, args.model_name, args.output_folder)
    log_results(model_accuracy, args.output_folder, args.model_name, matrix,
                labels, run)


if __name__ == '__main__':
    run = Run.get_context()
    args = get_parsed_args()
    makedirs(args.output_folder, exist_ok=True)
    dataset = Dataset.get_by_name(run.experiment.workspace,
                                  name=args.dataset_name)
    log_arguments(args, dataset, run)
    main(args, run, dataset)
    run.complete()
Example #16
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="COVID19Articles_model_github.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    datastore_name = os.environ.get("DATASTORE_NAME")
    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    # Get the dataset
    if (dataset_name):
        if (data_file_path == ""):
            if (dataset_name
                    in Dataset.get_all(run.experiment.workspace).keys()):
                dataset = Dataset.get_by_name(run.experiment.workspace,
                                              dataset_name,
                                              version=dataset_version)
            else:
                create_sample_data_csv(run.experiment.workspace,
                                       datastore_name)
                dataset = register_dataset(run.experiment.workspace,
                                           dataset_name, datastore_name)
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       datastore_name, data_file_path)
    else:
        if (data_file_path == ""):
            data_file_path = "COVID19Articles.csv"
            create_sample_data_csv(run.experiment.workspace, datastore_name)
        dataset_name = "COVID19Articles_Training_githubactions"
        dataset = register_dataset(run.experiment.workspace, dataset_name,
                                   datastore_name, data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    class_args = {"max_depth": 5}
    # Train the model
    model = train_model(data, class_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)

    # files saved in the "outputs" folder are automatically uploaded into run history
    model_file_name = "COVID19Articles_model.pkl"
    joblib.dump(model, os.path.join('outputs', model_file_name))
    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #17
argparser.add_argument('--n_layers', type=int, default=2)
argparser.add_argument('--learning_rate', type=float, default=0.01)
argparser.add_argument('--chunk_len', type=int, default=200)
argparser.add_argument('--batch_size', type=int, default=100)
argparser.add_argument('--shuffle', action='store_true')
argparser.add_argument('--cuda', action='store_true')
args = argparser.parse_args()

# TODO: Download the dataset you uploaded earlier by using
# the Dataset class using the 'get_by_name' and 'download' methods.
# Use the received file_path as input to the 'read_file()' function.
# HINT:
#   In the 'get_by_name' method the name input-field should be 'name=args.dataset'
#   The filepath is a list and 'read_file' expects a string
#   https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.dataset.dataset?view=azure-ml-py
dataset = Dataset.get_by_name(ws, name=args.dataset)
file_path = dataset.download(target_path='.', overwrite=True)

file, file_len = read_file(file_path[0])  # TODO: Input the file path here


# Splitting dataset function
def random_training_set(chunk_len, batch_size):
    inp = torch.LongTensor(batch_size, chunk_len)
    target = torch.LongTensor(batch_size, chunk_len)
    for bi in range(batch_size):
        start_index = random.randint(0, file_len - chunk_len)
        end_index = start_index + chunk_len + 1
        chunk = file[start_index:end_index]
        if len(chunk[:-1]) < 200: continue
        inp[bi] = char_tensor(chunk[:-1])
Example #18
from azureml.core import Workspace, Dataset

subscription_id = 'REPLACE'
resource_group = 'REPLACE'
workspace_name = 'REPLACE'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='MSFT')
dataset = dataset.to_pandas_dataframe()

print(dataset.head())
print(dataset.dtypes)
Example #19
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    # XGBClassifier args
    parser.add_argument("--eta", type=float, help="Learning rate for model")
    parser.add_argument("--max_depth", type=int, help="Depth for trees")
    parser.add_argument("--min_child_weight",
                        type=int,
                        help="Min child weight for tree")
    parser.add_argument(
        "--subsample",
        type=float,
        help="Subsample of training set used for each iteration")
    parser.add_argument("--colsample_bytree",
                        type=float,
                        help="Subsample of columns to use for each iteration")
    parser.add_argument(
        "--early_stopping_rounds",
        type=int,
        help=
        "Model will stop iterating if no improvement after set number of rounds"
    )
    parser.add_argument("--eval_metric",
                        type=str,
                        default="auc",
                        help="Metric for evaluation")
    parser.add_argument(
        "--scale_pos_weight",
        type=float,
        help="Control balance of positive and negative weights")
    parser.add_argument("--max_delta_step",
                        type=int,
                        help="Conservativeness of update step")
    parser.add_argument("--num_boost_rounds",
                        type=int,
                        help="Number of estimators ")
    # SGD args
    parser.add_argument("--alpha",
                        type=float,
                        default="linear",
                        help="Regularization strength")
    parser.add_argument("--l1_ratio",
                        type=float,
                        default=1.0,
                        help="l1_ratio in elasticnet penalty")

    # ExtraTreesClassifier args
    parser.add_argument("--n_estimators",
                        type=int,
                        help="Number of trees in the forest")
    parser.add_argument("--min_samples_split",
                        type=float,
                        help="Min number to split a node")
    parser.add_argument("--min_samples_leaf",
                        type=float,
                        help="Min number of samples at leaf node")
    parser.add_argument("--max_features",
                        type=float,
                        help="Number of features to consider for split")
    parser.add_argument("--ccp_alpha",
                        type=float,
                        help="Complexity parameter for pruning")

    args = parser.parse_args()

    # Retrieve datasets by name | Create train/val
    location = Dataset.get_by_name(workspace=workspace,
                                   name="cleaned_loan_dataset").download()
    print(location)

    train = pd.read_parquet(location[0])

    x_train, x_val = train_test_split(train,
                                      test_size=0.3,
                                      stratify=train.default_status,
                                      random_state=20)
    y_train, y_val = x_train.pop("default_status"), x_val.pop("default_status")

    # SGD linear classifier (elastic net penalty)
    scaler = StandardScaler()
    sgd = SGDClassifier(alpha=args.alpha,
                        l1_ratio=args.l1_ratio,
                        penalty="elasticnet",
                        loss="modified_huber",
                        class_weight="balanced",
                        early_stopping=True)
    sgd_clf = make_pipeline(scaler, sgd)

    # ExtraTreesClassifier
    etc_clf = ExtraTreesClassifier(n_estimators=args.n_estimators,
                                   min_samples_split=args.min_samples_split,
                                   min_samples_leaf=args.min_samples_leaf,
                                   max_features=args.max_features,
                                   ccp_alpha=args.ccp_alpha,
                                   class_weight="balanced")

    # XGBoost
    xgb_clf = XGBClassifier(objective="binary:logistic",
                            n_estimators=args.num_boost_rounds,
                            max_depth=args.max_depth,
                            min_child_weight=args.min_child_weight,
                            learning_rate=args.eta,
                            subsample=args.subsample,
                            colsample_bytree=args.colsample_bytree,
                            eval_metric=args.eval_metric,
                            scale_pos_weight=args.scale_pos_weight,
                            max_delta_step=args.max_delta_step)

    # VotingClassifier
    model = VotingClassifier(estimators=[("sgd", sgd_clf), ("etc", etc_clf),
                                         ("xgb", xgb_clf)],
                             voting="soft")

    model.fit(x_train, y_train)

    # Make prediction on Val dataset & log AUC
    y_pred = model.predict(x_val)
    auc_score = roc_auc_score(y_val, y_pred, average="weighted")
    run.log("auc", float(auc_score))

    print("Classification Report: \n", classification_report(y_val, y_pred))

    # Dump model artifact
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, "outputs/model_voting.joblib")
    dest="target_column_name",
    help="Target Column Name",
)
parser.add_argument(
    "--test_dataset", type=str, dest="test_dataset", help="Test Dataset"
)

args = parser.parse_args()
target_column_name = args.target_column_name
test_dataset_id = args.test_dataset

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by id
test_dataset = Dataset.get_by_id(ws, id=test_dataset_id)

X_test = (
    test_dataset.drop_columns(columns=[target_column_name])
    .to_pandas_dataframe()
    .reset_index(drop=True)
)
y_test_df = (
    test_dataset.with_timestamp_columns(None)
    .keep_columns(columns=[target_column_name])
    .to_pandas_dataframe()
)

# generate forecast
fitted_model = joblib.load("model.pkl")
# The default quantile values are set as below (95th percentile)
Example #21
from azureml.core import Workspace, Dataset, Experiment, Run
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.widgets import RunDetails
import os
from utils import get_workspace

ws = get_workspace()
cluster_name = "bbacompute"
dataset_name = "bearing_dataset"

dataset = Dataset.get_by_name(ws, dataset_name)

try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print("cluster exist: ", cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size="standard_d12_v2", max_nodes=1)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
cluster.wait_for_completion(show_output=True)

exp_name = "exp_bearing_anomaly_lstm"
experiment = Experiment(ws, name=exp_name)

estimator = TensorFlow(
        source_directory='.', 
        entry_script='lstm.py', 
        script_params={'--run_at': 'remote'},
        inputs=[dataset.as_named_input('bearingdata')],
        compute_target=cluster, 
Example #22
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="insure_model_model.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data[1])
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #23
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from azureml.core import Workspace, Datastore, Dataset, Run
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.model import Model
import pandas as pd
import os
import joblib
import sklearn

# get the current run
run = Run.get_context()
ws = run.experiment.workspace
datastore = ws.get_default_datastore()

# get the dataset
ds = Dataset.get_by_name(ws, "diabetes_cleaned")
diabetes_df = ds.to_pandas_dataframe()

X = diabetes_df.drop("class", axis=1)
y = diabetes_df["class"]

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=diabetes_df["class"], random_state=0)

# init the model
model = KNeighborsClassifier()

# train the model
model.fit(X_train, y_train)
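The example appears cut off after training; a hedged sketch of how the so-far unused imports above (accuracy_score, joblib, Model, ResourceConfiguration) are typically applied to finish it. The model and file names are assumptions.

accuracy = accuracy_score(y_test, model.predict(X_test))
run.log("accuracy", float(accuracy))

# save the fitted model and register it against the workspace
os.makedirs("outputs", exist_ok=True)
model_path = "outputs/diabetes_knn.pkl"      # hypothetical file name
joblib.dump(model, model_path)

registered_model = Model.register(
    workspace=ws,
    model_name="diabetes_knn",               # hypothetical model name
    model_path=model_path,
    resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),
)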
Example #24
                            overwrite=True,
                            show_progress=False)

    # Register dataset

    path_on_datastore = os.path.join(target_path, file_name)
    dataset = Dataset.Tabular.from_delimited_files(path=(default_ds,
                                                         path_on_datastore))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               description='diabetes training data',
                               tags={'format': 'CSV'},
                               create_new_version=True)

# Get the dataset
dataset = Dataset.get_by_name(aml_workspace, dataset_name)

# Create a PipelineData to pass data between steps
pipeline_data = PipelineData('pipeline_data',
                             datastore=aml_workspace.get_default_datastore())

# Configure step for training model
train_model = PythonScriptStep(
    name="Train Model",
    script_name=variables["TRAIN_SCRIPT_PATH"],
    compute_target=compute_target,
    runconfig=run_config,
    inputs=[dataset.as_named_input('training_data')],
    outputs=[pipeline_data],
    allow_reuse=False,
    arguments=["--step_output", pipeline_data])
Example #25
args = parser.parse_args()
target_column_name = args.target_column_name
model_name = args.model_name

print("args passed are: ")
print("Target column name: ", target_column_name)
print("Name of registered model: ", model_name)

model_path = Model.get_model_path(model_name)
# deserialize the model file back into a sklearn model
model = joblib.load(model_path)

run = Run.get_context()

test_dataset = Dataset.get_by_id(run.experiment.workspace, id=args.input_data)

X_test_df = test_dataset.drop_columns(
    columns=[target_column_name]).to_pandas_dataframe()
y_test_df = (test_dataset.with_timestamp_columns(None).keep_columns(
    columns=[target_column_name]).to_pandas_dataframe())

predicted = model.predict_proba(X_test_df)

if isinstance(predicted, pd.DataFrame):
    predicted = predicted.values

# Use the AutoML scoring module
train_labels = model.classes_
class_labels = np.unique(
    np.concatenate((y_test_df.values, np.reshape(train_labels, (-1, 1)))))
Example #26
import os
import random

from azureml.core import Dataset, Run
from load_data import train_test_bert, prep_train_test_bert

random.seed(4)

run = Run.get_context()
workspace = run.experiment.workspace

dataset = Dataset.get_by_name(workspace=workspace, name='dataset')
dataset.download(target_path='.', overwrite=False)

dist = Dataset.get_by_name(workspace=workspace, name='dataset_dist')
dist.download(target_path='.', overwrite=False)

def bert_precompute():
    prep_train_test_bert('./media.csv', './dist.dat', './models/1024dRoBertAModel',
            10, result_path='./result1024dRoBertA.txt', check=1,
            pretrained_weights='roberta-base')

def train_mlp():
    os.makedirs(os.path.dirname('./outputs/'), exist_ok=True)
    precalced = Dataset.get_by_name(workspace, name='distilbert-base-uncased_pack')
    precalced.download(target_path='./outputs/', overwrite=False)
    train_test_bert('./media.csv', './dist.dat', './models/768dBertModel',
                    10, result_path='./result768dBert.txt', check=1,
                    pretrained_weights='distilbert-base-uncased')


train_mlp()
Example #27
parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=1e-5)
parser.add_argument('--adam_epsilon', type=float, dest='adam_epsilon', default=1e-8)
parser.add_argument('--num_epochs', type=int, dest='num_epochs', default=5)

args = parser.parse_args()

dataset_name = args.dataset_name
batch_size = args.batch_size
learning_rate = args.learning_rate
adam_epsilon = args.adam_epsilon
num_epochs = args.num_epochs

run = Run.get_context()
workspace = run.experiment.workspace

dataset = Dataset.get_by_name(workspace, name=dataset_name)
file_name = dataset.download()[0]
df = pd.read_csv(file_name)

label_counts = pd.DataFrame(df['Product'].value_counts())
label_values = list(label_counts.index)
order = list(pd.DataFrame(df['Product_Label'].value_counts()).index)
label_values = [l for _,l in sorted(zip(order, label_values))]

texts = df['Complaint'].values
labels = df['Product_Label'].values

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

text_ids = [tokenizer.encode(text, max_length=300, pad_to_max_length=True) for text in texts]
Example #28
import joblib
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn_pandas import DataFrameMapper
import os
import pandas as pd
import shutil

from azureml.core import Run, Dataset, Workspace

ws = Run.get_context().experiment.workspace
os.makedirs('./outputs', exist_ok=True)

attritionData = Dataset.get_by_name(ws,'employeeattrition').to_pandas_dataframe()

# Dropping Employee count as all values are 1 and hence attrition is independent of this feature
attritionData = attritionData.drop(['EmployeeCount'], axis=1)
# Dropping Employee Number since it is merely an identifier
attritionData = attritionData.drop(['EmployeeNumber'], axis=1)
attritionData = attritionData.drop(['Over18'], axis=1)
# Since all values are 80
attritionData = attritionData.drop(['StandardHours'], axis=1)

attritionData["Attrition_numerical"] = attritionData["Attrition"]
target = attritionData["Attrition_numerical"]

attritionXData = attritionData.drop(['Attrition_numerical', 'Attrition'], axis=1)

# Creating dummy columns for each categorical feature
Example #29
def parse_id_to_dataset(dataset_id):
    run = Run.get_context()
    ws = run.experiment.workspace
    return Dataset.get_by_id(ws, id=dataset_id)
Example #30
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
#from sklearn.model_selection import TimeSeriesSplit
#from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.arima_model import ARIMA

from azureml.core import Dataset, Run

run = Run.get_context()
# get input dataset by name
#dataset = run.input_datasets['transaction_ts']

ws = run.experiment.workspace
dataset1 = Dataset.get_by_name(workspace=ws, name='transaction_ts2013')
df = dataset1.to_pandas_dataframe()

df.set_index('TransactionDate', inplace=True)
df.columns = ['PaidAmount']
series = pd.Series(df['PaidAmount'])


def mean_and_variance(X):
    split = int(len(X) / 2)
    X1, X2 = X[0:split], X[split:]
    mean1, mean2 = X1.mean(), X2.mean()
    var1, var2 = X1.var(), X2.var()
    print('mean1=%f, mean2=%f' % (mean1, mean2))
    print('variance1=%f, variance2=%f' % (var1, var2))
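adfuller is imported above but not used in the visible part of the example; a hedged sketch of the usual stationarity check on the same series:

result = adfuller(series.dropna())
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
for key, value in result[4].items():
    print('Critical value (%s): %.3f' % (key, value))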