Example #1
            def register_dataset(dataset_name, dataframe):
                # Look up the registration config entry for this dataset name
                dataset_config = next(
                    cfg for cfg in self.output_reg_datasets
                    if cfg["name"] == dataset_name)

                datastore = dataset_config.get("datastore") or "default"
                description = dataset_config.get("description")
                tags = dataset_config.get("tags")

                if datastore == "default":
                    ds = ws.get_default_datastore()
                else:
                    ds = Datastore.get(workspace=ws, datastore_name=datastore)

                target_path = f'experiment/{run.experiment.name}/run/{run.number}/out/{dataset_name}'

                default_output_dataset_tags = {
                    # Dataset.Tabular.register_pandas_dataframe always writes parquet
                    "format": self.OUTPUT_FORMAT,
                    "experiment": run.experiment.name,
                    "run": run.number
                }

                output_dataset_tags = {**default_output_dataset_tags, **tags}

                Dataset.Tabular.register_pandas_dataframe(
                    dataframe,
                    target=(ds, target_path),
                    name=dataset_name,
                    description=description,
                    tags=output_dataset_tags)
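For context, a minimal sketch of what an entry in self.output_reg_datasets might look like; the keys mirror the .get() calls above, while the values are hypothetical:

# Hypothetical config entry consumed by register_dataset above.
output_reg_datasets = [
    {
        "name": "predictions",            # matched against dataset_name
        "datastore": "default",           # or the name of a registered datastore
        "description": "Model predictions, one row per input record",
        "tags": {"team": "forecasting"},  # merged over the default tags
    },
]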
Example #2
from azureml.core import Datastore, Workspace
from msrest.exceptions import HttpOperationError


def get_blob_datastore(workspace: Workspace, data_store_name: str,
                       storage_name: str, storage_key: str,
                       container_name: str):
    """
    Returns a reference to a datastore
    Parameters:
      workspace (Workspace): existing AzureML Workspace object
      data_store_name (string): data store name
      storage_name (string): blob storage account name
      storage_key (string): blob storage account key
      container_name (string): container name
    Returns:
        Datastore: a reference to datastore
    """
    try:
        blob_datastore = Datastore.get(workspace, data_store_name)
        print("Found Blob Datastore with name: %s" % data_store_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=data_store_name,
            account_name=storage_name,  # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=storage_key)  # Storage account key
        print("Registered blob datastore with name: %s" % data_store_name)
    return blob_datastore
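A hedged usage sketch (the environment variable names and the datastore/container names are assumptions):

import os

# Hypothetical call site; credentials come from environment variables.
ws = Workspace.from_config()
datastore = get_blob_datastore(workspace=ws,
                               data_store_name='my_blob_datastore',
                               storage_name=os.environ['STORAGE_ACCOUNT_NAME'],
                               storage_key=os.environ['STORAGE_ACCOUNT_KEY'],
                               container_name='data')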
Example #3
    def get_ds_object(ws, name):
        """
        get_ds_object - Get a workspace datastore object

        :param azureml.core.Workspace ws: workspace
        :param str name: datastore name

        :returns: the named datastore
        :rtype: azureml.core.Datastore
        """
        return Datastore.get(ws, name)
Example #4
def register_dataset(path, aml_interface, storage_acct_name, storage_acct_key):
    workspace = aml_interface.workspace
    # register_azure_blob_container returns the datastore, so the original
    # follow-up Datastore.get(workspace, 'prediction') call is redundant
    prediction_datastore = Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name='prediction',
        container_name='prediction',
        account_name=storage_acct_name,
        account_key=storage_acct_key)

    datastore_path = [(prediction_datastore, path)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)
    dataset = dataset.register(workspace=workspace, name='Prediction')
    return dataset
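Note that registering under an existing name raises unless a new version is requested; if register_dataset runs repeatedly, a variant like this may be needed:

# Hypothetical variant: allow re-registration as a new dataset version.
dataset = dataset.register(workspace=workspace,
                           name='Prediction',
                           create_new_version=True)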
Example #5
def get_or_create_dataset(azure_config: AzureConfig,
                          azure_dataset_id: str) -> Dataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, one is created
    and registered, assuming that the files are in a folder that has the same name as the dataset. For example, if
    azure_dataset_id is 'foo', then the 'foo' dataset points to the <container_root>/datasets/foo folder.

    WARNING: the behaviour of Dataset.File.from_files, used below, is idiosyncratic. For example,
    if "mydataset" storage has two "foo..." subdirectories each containing
    a file dataset.csv and a directory ABC,

    datastore = Datastore.get(workspace, "mydataset")
    # This dataset has the file(s) in foo-bar01 at top level, e.g. dataset.csv
    ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
    # This dataset has two directories at top level, each with a name matching foo-bar*, and each
    # containing dataset.csv.
    ds2 = Dataset.File.from_files([(datastore, "foo-bar*/*")])
    # This dataset contains a single directory "mydataset" at top level, containing a subdirectory
    # foo-bar01, containing dataset.csv and (part of) ABC.
    ds3 = Dataset.File.from_files([(datastore, "foo-bar01/*"),
                                   (datastore, "foo-bar01/ABC/abc_files/*/*.nii.gz")])

    These behaviours can be verified by calling "ds.download()" on each dataset ds.
    """
    if not azure_config.azureml_datastore:
        raise ValueError(
            "No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)"
        )
    logging.info(
        f"Retrieving datastore '{azure_config.azureml_datastore}' from AzureML workspace"
    )
    workspace = azure_config.get_workspace()
    datastore = Datastore.get(workspace, azure_config.azureml_datastore)
    try:
        logging.info(
            f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    except Exception:
        logging.info(
            f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'"
        )
        # See WARNING above before changing the from_files call!
        azureml_dataset = Dataset.File.from_files([(datastore,
                                                    azure_dataset_id)])
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset
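As the docstring notes, the globbing behaviour can be verified empirically; a minimal sketch (the download target path is a placeholder):

# Hypothetical check of what a from_files glob actually captured.
ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
downloaded = ds1.download(target_path="/tmp/ds1_check", overwrite=True)
print("\n".join(downloaded))  # the resolved file paths reveal the layout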
Example #6
    def config(ws, blob_datastore_name, account_name, container_name,
               account_key):

        try:
            blob_datastore = Datastore.get(ws, blob_datastore_name)
            print("Found Blob Datastore with name: %s" % blob_datastore_name)
        except HttpOperationError:
            blob_datastore = Datastore.register_azure_blob_container(
                workspace=ws,
                datastore_name=blob_datastore_name,
                account_name=account_name,  # Storage account name
                container_name=container_name,  # Name of Azure blob container
                account_key=account_key)  # Storage account key
            print("Registered blob datastore with name: %s" %
                  blob_datastore_name)

        return blob_datastore
Example #7
def createDataReference(workspace, storage_name, storage_key,
                        storage_container_name, data_store_name,
                        data_reference_name):
    '''
        If not present, registers a new azureml.core.datastore.Datastore.
        Once the data store is in hand, it creates an instance of
        azureml.data.data_reference.DataReference that can be used in an
        Azure ML pipeline step.

        PARAMS:
            workspace               : azureml.core.Workspace    : Existing AMLS Workspace
            storage_name            : string                    : Name of the Azure Storage Account
            storage_key             : string                    : Access Key to the Azure Storage Account
            storage_container_name  : string                    : Container name to receive blobs. Must exist
            data_store_name         : string                    : Name of the registered data store
            data_reference_name     : string                    : Name of the data reference

        RETURNS:
            tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)
    '''
    data_store = None

    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception:
        print("Creating data store - ", data_store_name)

        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
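A hedged usage sketch (the workspace retrieval and all names below are assumptions):

import os
from azureml.core import Workspace

# Hypothetical call site; the storage key comes from an environment variable.
ws = Workspace.from_config()
data_store, data_ref = createDataReference(
    workspace=ws,
    storage_name='mystorageaccount',
    storage_key=os.environ['STORAGE_KEY'],
    storage_container_name='pipelinedata',
    data_store_name='pipeline_datastore',
    data_reference_name='raw_input')
# data_ref can now be passed as an input to a pipeline step.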
Example #8
if 'creditcard' not in ws.datasets:

    # Set up the blob datastore
    # (datastore names may contain only lowercase letters, digits, and underscores)
    blob_datastore_name = 'my_blob_datastore'
    account_name = os.getenv(
        "BLOB_ACCOUNTNAME_62",
        "PUT YOUR STORAGE ACCOUNT NAME HERE")  # Storage account name
    container_name = os.getenv(
        "BLOB_CONTAINER_62",
        "PUT YOUR STORAGE CONTAINER NAME HERE")  # Name of Azure blob container
    account_key = os.getenv(
        "BLOB_ACCOUNT_KEY_62",
        "PUT YOUR STORAGE ACCOUNT KEY HERE")  # Storage account key

    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except Exception:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,  # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)  # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)

    blob_data_ref = DataReference(datastore=blob_datastore,
                                  data_reference_name="blob_test_data",
                                  path_on_datastore="testdata")
    csv_path = (blob_datastore, '/creditcard.csv')
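The snippet stops before the registration itself; a minimal sketch of the step that would plausibly follow (assuming Dataset is imported from azureml.core):

from azureml.core import Dataset

# Hypothetical continuation: build and register the tabular dataset so the
# surrounding `if 'creditcard' not in ws.datasets:` guard becomes effective.
creditcard_ds = Dataset.Tabular.from_delimited_files(path=csv_path)
creditcard_ds.register(workspace=ws, name='creditcard')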
Example #9
import argparse
from pathlib import Path

from azureml.core.datastore import Datastore
from azureml.core.workspace import Workspace

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--subscription-id", type=str)
    parser.add_argument("--resource-group", type=str)
    parser.add_argument("--workspace-name", type=str)
    parser.add_argument("--datastore-name", type=str)
    parser.add_argument("--data-directory", type=str)
    parser.add_argument("--dataset-name", type=str)
    args = parser.parse_args()

    print(args.workspace_name)
    workspace = Workspace(
        subscription_id=args.subscription_id,
        resource_group=args.resource_group,
        workspace_name=args.workspace_name,
    )
    datastore = Datastore.get(workspace, args.datastore_name)
    local_path = Path(args.data_directory)
    for phase in ["train", "val"]:
        local_directory = str(local_path / phase)
        target_path = str(Path(args.dataset_name) / phase)
        datastore.upload(local_directory,
                         target_path=target_path,
                         show_progress=True)
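On recent azureml-core versions, Dataset.File.upload_directory is the suggested replacement for the deprecated datastore.upload; a hedged equivalent of the loop body:

from azureml.core import Dataset
from azureml.data.datapath import DataPath

# Hypothetical equivalent using the newer upload API.
Dataset.File.upload_directory(src_dir=local_directory,
                              target=DataPath(datastore, target_path),
                              show_progress=True)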
Example #10
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('No compute cluster named {}'.format(cluster_name))
    exit()

curated_env_name = 'Resnet50v15-CPU-cluster'
# pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
pytorch_env = Environment.from_conda_specification(
    name=curated_env_name, file_path='./conda_dependencies.yml')

project_folder = './'
data_path = 'datasets'

datastore = Datastore.get(ws, 'workspaceblobstore')
dataset = Dataset.File.from_files(path=(datastore, data_path))
data_loc = dataset.as_named_input('input').as_mount()

src = ScriptRunConfig(
    source_directory=project_folder,
    # command=['ls'],
    script='train_resnet.py',
    arguments=[
        '--num_epochs', 16,
        '--batch', '32',
        '--shuffle', 'True',
        # assumed completion of the truncated snippet: pass the mounted
        # dataset path and wire up the compute target and environment
        '--dataloc', data_loc
    ],
    compute_target=compute_target,
    environment=pytorch_env)
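A hedged sketch of submitting the configured run (the experiment name is an assumption):

from azureml.core import Experiment

# Hypothetical submission; 'resnet50-training' is a placeholder name.
run = Experiment(workspace=ws, name='resnet50-training').submit(src)
run.wait_for_completion(show_output=True)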
Example #11
import os
from pathlib import Path

import pandas as pd
from azureml.core import Datastore

from utils import get_workspace

raw_data_dir = "C:\\Dataspace\\IMS\\2nd_test"
prep_data_dir = "C:\\Dataspace\\IMS\\processed\\2nd_test"
datastore_name = "bearing_datastore"
dataset_name = "bearing_dataset"
container_name = "bearingdata"

sensor_data = pd.DataFrame()

ws = get_workspace()

try:
    datastore = Datastore.get(ws, datastore_name)
    print("Datastore found: ", datastore_name)
except Exception:
    datastore = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=datastore_name,
        account_name=os.environ.get('AML_BLOB_ACCOUNT_NAME'),
        container_name=container_name,
        account_key=os.environ.get('AML_BLOB_ACCOUNT_KEY'),
        endpoint="core.chinacloudapi.cn")
    print("Datastore registered: ", datastore_name)

for filename in os.listdir(raw_data_dir):
    data = pd.read_csv(os.path.join(raw_data_dir, filename),
                       names=["c1", "c2", "c3", "c4"],
                       sep='\t')
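The example is cut off mid-loop; a hedged sketch of a plausible continuation (restating the loop, with the concatenation, output file name, and upload target as assumptions):

for filename in os.listdir(raw_data_dir):
    data = pd.read_csv(os.path.join(raw_data_dir, filename),
                       names=["c1", "c2", "c3", "c4"],
                       sep='\t')
    # Hypothetical accumulation step
    sensor_data = pd.concat([sensor_data, data], ignore_index=True)

# Hypothetical persistence and upload of the prepared data
os.makedirs(prep_data_dir, exist_ok=True)
sensor_data.to_csv(os.path.join(prep_data_dir, "sensor_data.csv"), index=False)
datastore.upload(prep_data_dir, target_path=dataset_name, show_progress=True)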
Example #12
# Assumed reconstruction of the call head, which is truncated in the source;
# the variable name matches the print below
dsNYCTaxi = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=blob_datastore_name,
    account_name=account_name,  # ADLS Gen2 account name
    container_name='nyctaxi',  # ADLS Gen2 filesystem
    account_key=account_key)

# COMMAND ----------

print(dsNYCTaxi.datastore_type)

# COMMAND ----------

from azureml.pipeline.core import PipelineParameter,Pipeline, PipelineData
from azureml.data.data_reference import DataReference

# Use the default blob storage
dsNYCTaxi = Datastore.get(ws, "dsblob")
print('Datastore {} will be used'.format(dsNYCTaxi.name))

#pipeline_param = PipelineParameter(name="my_pipeline_param", default_value="pipeline_param1")


datasetFilePath = DataReference(datastore=dsNYCTaxi,
                                path_on_datastore="/merged_aml_dbr",
                                data_reference_name="datasetFilePath")

output = PipelineData("output", datastore=dsNYCTaxi)

# COMMAND ----------

from azureml.pipeline.steps import DatabricksStep
from azureml.core.databricks import PyPiLibrary
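The example ends at the imports; a hedged sketch of the DatabricksStep those imports set up (the compute name, notebook path, cluster sizing, and library choice are all assumptions):

from azureml.core.compute import DatabricksCompute

# Hypothetical step wiring the data reference and pipeline output together.
databricks_compute = DatabricksCompute(workspace=ws, name='adb-compute')
dbStep = DatabricksStep(
    name='prep_nyctaxi',
    inputs=[datasetFilePath],
    outputs=[output],
    notebook_path='/Shared/prep_nyctaxi',  # placeholder notebook
    run_name='prep_nyctaxi',
    compute_target=databricks_compute,
    num_workers=2,
    node_type='Standard_DS3_v2',  # placeholder VM size
    pypi_libraries=[PyPiLibrary(package='azureml-sdk')],
    allow_reuse=True)

pipeline = Pipeline(workspace=ws, steps=[dbStep])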