Example #1
# Imports for the pipeline definition (azureml-sdk). `ws` (the Workspace), `cd`
# (a CondaDependencies object) and `compute_target` are assumed to have been
# created earlier in the script.
import argparse

from azureml.core import Datastore, RunConfiguration
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

# Specify the run configuration
run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.python.conda_dependencies = cd

# Pipeline definition
inputdata = DataReference(
    datastore=Datastore.get(ws, "trainingdata"),
    data_reference_name="data"
)

train_model = PythonScriptStep(
    script_name="./train.py",
    name="fit-nlp-model",
    inputs=[inputdata.as_download(path_on_compute="./data")],
    runconfig=run_config,
    compute_target=compute_target,
)

pipeline = Pipeline(
    workspace=ws,
    steps=[train_model],
    description="Builds Keras model for detecting component defects",
)

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--publish", action="store_true")
Example #2
# Excerpt from the azureml-sdk datastore implementation. Standard-library imports
# are shown below; the azureml-internal helpers used in this class (AbstractDatastore,
# DataReference, UserErrorException, TaskQueue, AsyncTask, _Counter, to_unix_path,
# create_retry, module_logger) come from the SDK's own modules and are omitted here.
import collections
import json
import os
import re
from abc import ABCMeta, abstractmethod

import requests


class AbstractAzureStorageDatastore(AbstractDatastore):
    """Represents the base class for datastores that save connection information to Azure Blob and Azure File storage.

    You should not work with this class directly. To create a datastore, use one of the ``register*`` methods
    of the Datastore class, for example, :meth:`azureml.core.datastore.Datastore.register_azure_blob_container`.

    Note: When using a datastore to access data, you must have permission to access the data, which depends on the
    credentials registered with the datastore.

    :param workspace: The workspace this datastore belongs to.
    :type workspace: azureml.core.workspace.Workspace
    :param name: The name of the datastore. It can only contain alphanumeric
        characters or - or _.
    :type name: str
    :param datastore_type: The type of this datastore, either "AzureBlob" or "AzureFile".
    :type datastore_type: str
    :param container_name: The container name.
    :type container_name: str
    :param account_name: The storage account name.
    :type account_name: str
    :param sas_token: The SAS token for accessing this container, defaults to None.
    :type sas_token: str, optional
    :param account_key: The storage account key, defaults to None.
    :type account_key: str, optional
    :param protocol: The protocol to use to connect to the storage account.
        If None, defaults to https.
    :type protocol: str, optional
    :param endpoint: The endpoint of the blob container. If None, defaults to core.windows.net.
    :type endpoint: str, optional
    """

    __metaclass__ = ABCMeta

    _sanitize_regex = re.compile(r"^(\.*[/\\])*")

    def __init__(self,
                 workspace,
                 name,
                 datastore_type,
                 container_name,
                 account_name,
                 sas_token=None,
                 account_key=None,
                 protocol=None,
                 endpoint=None):
        """Class AbstractAzureStorageDatastore constructor.

        :param workspace: The workspace this datastore belongs to.
        :type workspace: azureml.core.workspace.Workspace
        :param name: The name of the datastore. It can only contain alphanumeric
            characters or - or _.
        :type name: str
        :param datastore_type: The type of this datastore, either "AzureBlob" or "AzureFile".
        :type datastore_type: str
        :param container_name: The container name.
        :type container_name: str
        :param account_name: The storage account name.
        :type account_name: str
        :param sas_token: The SAS token for accessing this container, defaults to None.
        :type sas_token: str, optional
        :param account_key: The storage account key, defaults to None.
        :type account_key: str, optional
        :param protocol: The protocol to use to connect to the storage account.
            If None, defaults to https.
        :type protocol: str, optional
        :param endpoint: The endpoint of the blob container. If None, defaults to core.windows.net.
        :type endpoint: str, optional
        """
        super(AbstractAzureStorageDatastore,
              self).__init__(workspace, name, datastore_type)
        self.container_name = container_name
        self.account_name = account_name
        self.sas_token = sas_token
        self.account_key = account_key
        self.credential_type = 'None'
        self.protocol = protocol
        self.endpoint = endpoint

        if account_key:
            self.credential_type = 'AccountKey'
        if sas_token:
            self.credential_type = 'Sas'

        self._num_workers = 32

        self._data_reference = DataReference(datastore=self)

    def __repr__(self):
        """Return the string representation of the AbstractAzureStorageDatastore object.

        :return: String representation of the AbstractAzureStorageDatastore object
        :rtype: str
        """
        content = collections.OrderedDict()

        content['name'] = self.name
        content['container_name'] = self.container_name
        content['account_name'] = self.account_name
        content['protocol'] = self.protocol
        content['endpoint'] = self.endpoint

        return json.dumps(content, indent=2)

    def __str__(self):
        """Return the string representation of the AbstractAzureStorageDatastore object.

        :return: String representation of the AbstractAzureStorageDatastore object
        :rtype: str
        """
        return self.__repr__()

    def path(self, path=None, data_reference_name=None):
        """Return corresponding data reference object.

        :param path: The relative path on the datastore.
        :type path: str
        :param data_reference_name: The name of the data reference.
        :type data_reference_name: str
        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.path(path, data_reference_name)

    def as_download(self, path_on_compute=None):
        """Return data reference object with download mode.

        :param path_on_compute: The relative path on the compute.
        :type path_on_compute: str
        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.as_download(path_on_compute)

    def as_upload(self, path_on_compute=None):
        """Return data reference object with upload mode.

        :param path_on_compute: The relative path on the compute.
        :type path_on_compute: str
        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.as_upload(path_on_compute)

    def as_mount(self):
        """Return data reference object with mount mode.

        :param path_on_compute: The relative path on the compute.
        :type path_on_compute: str
        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.as_mount()

    @abstractmethod
    def download(self,
                 target_path,
                 prefix=None,
                 overwrite=False,
                 show_progress=True):
        """Download paths with prefix to target_path.

        :param target_path:
        :param prefix:
        :param overwrite:
        :param show_progress:
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def upload(self,
               src_dir,
               target_path=None,
               overwrite=False,
               show_progress=True):
        """Upload src_dir to target_path.

        :param src_dir:
        :param target_path:
        :param overwrite:
        :param show_progress:
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def upload_files(self,
                     files,
                     relative_root=None,
                     target_path=None,
                     overwrite=False,
                     show_progress=True):
        """Upload files to target_path.

        :param files:
        :param relative_root: relative path in target
        :param target_path:
        :param overwrite:
        :param show_progress:
        :return:
        """
        raise NotImplementedError()

    def _get_data_reference(self):
        return self._data_reference

    @property
    def is_sas(self):
        """Deprecated: use ``credential_type`` property.

        This property is deprecated, please use the property ``credential_type`` to determine the credential type.
        """
        module_logger.warning(
            "This property is deprecated, please use the property \"credential_type\""
            + " to determine the credential type.")
        return bool(self.sas_token)

    def _as_dict(self, hide_secret=True):
        output = super(AbstractAzureStorageDatastore, self)._as_dict()
        output["container_name"] = self.container_name
        output["account_name"] = self.account_name
        output["protocol"] = self.protocol
        output["endpoint"] = self.endpoint

        if not hide_secret:
            output["credential_type"] = self.credential_type
            output["sas_token"] = self.sas_token
            output["account_key"] = self.account_key

        return output

    def _get_default_request_session(self):
        a = requests.adapters.HTTPAdapter(pool_connections=self._num_workers,
                                          pool_maxsize=self._num_workers * 2,
                                          max_retries=create_retry())
        s = requests.Session()
        s.mount("http://", a)
        s.mount("https://", a)
        return s

    def _get_upload_from_dir(self, src_path, target_path):
        src_path = src_path.rstrip("/\\")
        if not os.path.isdir(src_path):
            raise UserErrorException("src_path must be a directory.")

        paths_to_upload = []
        for dirpath, dirnames, filenames in os.walk(src_path):
            paths_to_upload += self._get_upload_from_files(
                map(lambda f: os.path.join(dirpath, f), filenames),
                target_path, src_path, True)
        return paths_to_upload

    def _get_upload_from_files(self, file_paths, target_path, relative_root,
                               skip_root_check):
        paths_to_upload = []
        target_path = AbstractAzureStorageDatastore._sanitize_target_path(
            target_path)
        for file_path in file_paths:
            if not skip_root_check and relative_root not in file_path and relative_root != "/":
                raise UserErrorException(
                    "relative_root: '{}' is not part of the file_path: '{}'.".
                    format(relative_root, file_path))
            if not os.path.isfile(file_path):
                err_msg = "'{}' does not point to a file. " + \
                    "Please upload the file to cloud first if running in a cloud notebook."
                raise UserErrorException(err_msg.format(file_path))

            target_file_path = to_unix_path(file_path)
            if relative_root != "/":
                # need to do this because Windows doesn't support relpath if the partition is different
                target_file_path = os.path.relpath(target_file_path,
                                                   to_unix_path(relative_root))
            else:
                # strip away / otherwise we will create a folder in the container with no name
                target_file_path = target_file_path.lstrip("/")

            if target_path:
                target_file_path = os.path.join(target_path, target_file_path)

            paths_to_upload.append((file_path, target_file_path))

        return paths_to_upload

    @staticmethod
    def _sanitize_target_path(target_path):
        if not target_path:
            return target_path
        return AbstractAzureStorageDatastore._sanitize_regex.sub(
            "", target_path)

    def _start_upload_task(self, paths_to_upload, overwrite, exists,
                           show_progress, task_generator):
        # it's an estimated total because we might skip some files
        estimated_total = len(paths_to_upload)
        counter = _Counter()
        console = self._get_progress_logger(show_progress, module_logger)

        console("Uploading an estimated of {} files".format(estimated_total))

        def exception_handler(e, logger):
            logger.error(
                "Upload failed, please make sure target_path does not start with "
                "invalid characters: %s", e)

        with TaskQueue(flush_timeout_seconds=float('inf'),
                       _ident=__name__,
                       _parent_logger=module_logger) as tq:
            for (src_file_path, target_file_path) in paths_to_upload:
                if not overwrite:
                    if exists(target_file_path):
                        estimated_total -= 1
                        console(
                            "Target already exists. Skipping upload for {}".
                            format(target_file_path))
                        continue

                task_fn = task_generator(target_file_path, src_file_path)
                future_handler = self._get_task_handler(
                    src_file_path, counter, estimated_total, show_progress,
                    "Upload", exception_handler)
                future = tq.create_future(task_fn)
                async_task = AsyncTask(
                    future,
                    handler=future_handler,
                    _ident="task_upload_{}".format(target_file_path),
                    _parent_logger=module_logger)
                tq.add_task(async_task)

        console("Uploaded {} files".format(counter.count()))
        return counter.count()

    def _get_task_handler(self,
                          f,
                          counter,
                          total,
                          show_progress,
                          action,
                          exception_handler=None):
        def handler(future, logger):
            print_progress = self._get_progress_logger(show_progress, logger)

            try:
                print_progress("{}ing {}".format(action, f))
                result = future.result()
                # thanks to the GIL, no lock is needed for this increment
                counter.increment()
                print_progress(
                    "{}ed {}, {} files out of an estimated total of {}".format(
                        action, f, counter.count(), total))
                return result
            except Exception as e:
                if exception_handler:
                    exception_handler(e, logger)
                else:
                    logger.error("Task Exception", e)

        return handler

    def _get_progress_logger(self, show_progress, logger=None):
        console = self._get_console_logger()

        def log(message):
            if show_progress:
                console.write("{}\n".format(message))
            if logger:
                logger.info(message)

        return log
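
For context, a brief usage sketch (not from the original source) of the public surface documented above: it assumes the datastore was previously registered (for example under the name "trainingdata", as in Example #1) and exercises the path, as_mount, as_download and download methods defined in this class; all names and paths are placeholders.

from azureml.core import Workspace, Datastore

ws = Workspace.from_config()

# Retrieve a previously registered blob datastore (an AzureBlobDatastore,
# i.e. a concrete subclass of AbstractAzureStorageDatastore).
blob_ds = Datastore.get(ws, "trainingdata")

# Data references for use as pipeline step inputs.
images = blob_ds.path("raw/images").as_mount()
labels = blob_ds.path("labels.csv").as_download(path_on_compute="./labels")

# Download files under a prefix; returns the number of files downloaded.
n_files = blob_ds.download(target_path="./local-data", prefix="raw/images")
print("Downloaded {} files".format(n_files))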