# Excerpt: pipeline definition script. The workspace ``ws``, conda dependencies ``cd``,
# and ``compute_target`` are assumed to be defined earlier in the script.
import argparse

from azureml.core import Datastore
from azureml.core.runconfig import RunConfiguration
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

# Specify the run configuration
run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.python.conda_dependencies = cd

# Pipeline definition
inputdata = DataReference(
    datastore=Datastore.get(ws, "trainingdata"),
    data_reference_name="data"
)

train_model = PythonScriptStep(
    script_name="./train.py",
    name="fit-nlp-model",
    inputs=[inputdata.as_download(path_on_compute="./data")],
    runconfig=run_config,
    compute_target=compute_target,
)

pipeline = Pipeline(
    workspace=ws,
    steps=[train_model],
    description="Builds Keras model for detecting component defects",
)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--publish", action="store_true")
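# --- Illustrative sketch (not part of the original script) -------------------
# The "trainingdata" datastore fetched above with Datastore.get() must have
# been registered beforehand. The helper below shows one way to do that with
# Datastore.register_azure_blob_container; the container name, account name,
# and key are placeholders, not values from the original script.
def _example_register_trainingdata(workspace):
    return Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name="trainingdata",
        container_name="training-data",       # placeholder container name
        account_name="mystorageaccount",      # placeholder storage account
        account_key="<storage-account-key>",  # or pass sas_token=... instead
    )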
class AbstractAzureStorageDatastore(AbstractDatastore):
    """Represents the base class for datastores that save connection information to Azure Blob and Azure File storage.

    You should not work with this class directly. To create a datastore, use one of the ``register*`` methods
    of the Datastore class, for example, :meth:`azureml.core.datastore.Datastore.register_azure_blob_container`.

    Note: When using a datastore to access data, you must have permission to access the data, which depends on
    the credentials registered with the datastore.

    :param workspace: The workspace this datastore belongs to.
    :type workspace: azureml.core.workspace.Workspace
    :param name: The name of the datastore. It can only contain alphanumeric characters or - or _.
    :type name: str
    :param datastore_type: The type of this datastore, either "AzureBlob" or "AzureFile".
    :type datastore_type: str
    :param container_name: The container name.
    :type container_name: str
    :param account_name: The storage account name.
    :type account_name: str
    :param sas_token: The SAS token for accessing this container, defaults to None.
    :type sas_token: str, optional
    :param account_key: The storage account key, defaults to None.
    :type account_key: str, optional
    :param protocol: The protocol to use to connect to the storage account. If None, defaults to https.
    :type protocol: str, optional
    :param endpoint: The endpoint of the blob container. If None, defaults to core.windows.net.
    :type endpoint: str, optional
    """

    __metaclass__ = ABCMeta

    _sanitize_regex = re.compile(r"^(\.*[/\\])*")

    def __init__(self, workspace, name, datastore_type, container_name, account_name,
                 sas_token=None, account_key=None, protocol=None, endpoint=None):
        """Class AbstractAzureStorageDatastore constructor.

        :param workspace: The workspace this datastore belongs to.
        :type workspace: azureml.core.workspace.Workspace
        :param name: The name of the datastore. It can only contain alphanumeric characters or - or _.
        :type name: str
        :param datastore_type: The type of this datastore, either "AzureBlob" or "AzureFile".
        :type datastore_type: str
        :param container_name: The container name.
        :type container_name: str
        :param account_name: The storage account name.
        :type account_name: str
        :param sas_token: The SAS token for accessing this container, defaults to None.
        :type sas_token: str, optional
        :param account_key: The storage account key, defaults to None.
        :type account_key: str, optional
        :param protocol: The protocol to use to connect to the storage account. If None, defaults to https.
        :type protocol: str, optional
        :param endpoint: The endpoint of the blob container. If None, defaults to core.windows.net.
        :type endpoint: str, optional
        """
        super(AbstractAzureStorageDatastore, self).__init__(workspace, name, datastore_type)
        self.container_name = container_name
        self.account_name = account_name
        self.sas_token = sas_token
        self.account_key = account_key
        self.credential_type = 'None'
        self.protocol = protocol
        self.endpoint = endpoint

        if account_key:
            self.credential_type = 'AccountKey'
        if sas_token:
            self.credential_type = 'Sas'

        self._num_workers = 32
        self._data_reference = DataReference(datastore=self)
    def __repr__(self):
        """Return the string representation of the AbstractAzureStorageDatastore object.

        :return: String representation of the AbstractAzureStorageDatastore object.
        :rtype: str
        """
        content = collections.OrderedDict()
        content['name'] = self.name
        content['container_name'] = self.container_name
        content['account_name'] = self.account_name
        content['protocol'] = self.protocol
        content['endpoint'] = self.endpoint
        return json.dumps(content, indent=2)

    def __str__(self):
        """Return the string representation of the AbstractAzureStorageDatastore object.

        :return: String representation of the AbstractAzureStorageDatastore object.
        :rtype: str
        """
        return self.__repr__()

    def path(self, path=None, data_reference_name=None):
        """Return the corresponding data reference object.

        :param path: The relative path on the datastore.
        :type path: str
        :param data_reference_name: The name of the data reference.
        :type data_reference_name: str
        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.path(path, data_reference_name)

    def as_download(self, path_on_compute=None):
        """Return a data reference object with download mode.

        :param path_on_compute: The relative path on the compute.
        :type path_on_compute: str
        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.as_download(path_on_compute)

    def as_upload(self, path_on_compute=None):
        """Return a data reference object with upload mode.

        :param path_on_compute: The relative path on the compute.
        :type path_on_compute: str
        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.as_upload(path_on_compute)

    def as_mount(self):
        """Return a data reference object with mount mode.

        :return: The data reference object.
        :rtype: azureml.data.data_reference.DataReference
        """
        return self._data_reference.as_mount()

    @abstractmethod
    def download(self, target_path, prefix=None, overwrite=False, show_progress=True):
        """Download paths with the given prefix to target_path.

        :param target_path: The local directory to download to.
        :param prefix: The path prefix in the datastore to filter on.
        :param overwrite: Whether to overwrite existing files.
        :param show_progress: Whether to show progress in the console.
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def upload(self, src_dir, target_path=None, overwrite=False, show_progress=True):
        """Upload src_dir to target_path.

        :param src_dir: The local directory to upload.
        :param target_path: The path in the datastore to upload to.
        :param overwrite: Whether to overwrite existing files.
        :param show_progress: Whether to show progress in the console.
        :return:
        """
        raise NotImplementedError()

    @abstractmethod
    def upload_files(self, files, relative_root=None, target_path=None, overwrite=False, show_progress=True):
        """Upload files to target_path.

        :param files: The list of local file paths to upload.
        :param relative_root: The relative path in the target.
        :param target_path: The path in the datastore to upload to.
        :param overwrite: Whether to overwrite existing files.
        :param show_progress: Whether to show progress in the console.
        :return:
        """
        raise NotImplementedError()

    def _get_data_reference(self):
        return self._data_reference
""" module_logger.warning( "This property is deprecated, please use the property \"credential_type\"" + " to determine the credential type.") return bool(self.sas_token) def _as_dict(self, hide_secret=True): output = super(AbstractAzureStorageDatastore, self)._as_dict() output["container_name"] = self.container_name output["account_name"] = self.account_name output["protocol"] = self.protocol output["endpoint"] = self.endpoint if not hide_secret: output["credential_type"] = self.credential_type output["sas_token"] = self.sas_token output["account_key"] = self.account_key return output def _get_default_request_session(self): a = requests.adapters.HTTPAdapter(pool_connections=self._num_workers, pool_maxsize=self._num_workers * 2, max_retries=create_retry()) s = requests.Session() s.mount("http://", a) s.mount("https://", a) return s def _get_upload_from_dir(self, src_path, target_path): src_path = src_path.rstrip("/\\") if not os.path.isdir(src_path): raise UserErrorException("src_path must be a directory.") paths_to_upload = [] for dirpath, dirnames, filenames in os.walk(src_path): paths_to_upload += self._get_upload_from_files( map(lambda f: os.path.join(dirpath, f), filenames), target_path, src_path, True) return paths_to_upload def _get_upload_from_files(self, file_paths, target_path, relative_root, skip_root_check): paths_to_upload = [] target_path = AbstractAzureStorageDatastore._sanitize_target_path( target_path) for file_path in file_paths: if not skip_root_check and relative_root not in file_path and relative_root != "/": raise UserErrorException( "relative_root: '{}' is not part of the file_path: '{}'.". format(relative_root, file_path)) if not os.path.isfile(file_path): err_msg = "'{}' does not point to a file. " + \ "Please upload the file to cloud first if running in a cloud notebook." raise UserErrorException(err_msg.format(file_path)) target_file_path = to_unix_path(file_path) if relative_root != "/": # need to do this because Windows doesn't support relpath if the partition is different target_file_path = os.path.relpath(target_file_path, to_unix_path(relative_root)) else: # strip away / otherwise we will create a folder in the container with no name target_file_path = target_file_path.lstrip("/") if target_path: target_file_path = os.path.join(target_path, target_file_path) paths_to_upload.append((file_path, target_file_path)) return paths_to_upload @staticmethod def _sanitize_target_path(target_path): if not target_path: return target_path return AbstractAzureStorageDatastore._sanitize_regex.sub( "", target_path) def _start_upload_task(self, paths_to_upload, overwrite, exists, show_progress, task_generator): # it's an estimated total because we might skip some files estimated_total = len(paths_to_upload) counter = _Counter() console = self._get_progress_logger(show_progress, module_logger) console("Uploading an estimated of {} files".format(estimated_total)) def exception_handler(e, logger): logger.error( "Upload failed, please make sure target_path does not start with invalid characters.", e) with TaskQueue(flush_timeout_seconds=float('inf'), _ident=__name__, _parent_logger=module_logger) as tq: for (src_file_path, target_file_path) in paths_to_upload: if not overwrite: if exists(target_file_path): estimated_total -= 1 console( "Target already exists. Skipping upload for {}". 
                        console("Target already exists. Skipping upload for {}".format(target_file_path))
                        continue

                task_fn = task_generator(target_file_path, src_file_path)
                future_handler = self._get_task_handler(
                    src_file_path, counter, estimated_total, show_progress, "Upload", exception_handler)
                future = tq.create_future(task_fn)
                async_task = AsyncTask(
                    future,
                    handler=future_handler,
                    _ident="task_upload_{}".format(target_file_path),
                    _parent_logger=module_logger)
                tq.add_task(async_task)

        console("Uploaded {} files".format(counter.count()))

        return counter.count()

    def _get_task_handler(self, f, counter, total, show_progress, action, exception_handler=None):
        def handler(future, logger):
            print_progress = self._get_progress_logger(show_progress, logger)
            try:
                print_progress("{}ing {}".format(action, f))
                result = future.result()
                # thanks to the GIL there is no need to use a lock here
                counter.increment()
                print_progress("{}ed {}, {} files out of an estimated total of {}".format(
                    action, f, counter.count(), total))
                return result
            except Exception as e:
                if exception_handler:
                    exception_handler(e, logger)
                else:
                    logger.error("Task Exception: {}".format(e))

        return handler

    def _get_progress_logger(self, show_progress, logger=None):
        console = self._get_console_logger()

        def log(message):
            if show_progress:
                console.write("{}\n".format(message))
            if logger:
                logger.info(message)

        return log
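# --- Illustrative sketch (not part of the original module) -------------------
# A minimal example of how a concrete subclass of AbstractAzureStorageDatastore
# (e.g. a blob datastore registered in a workspace) exposes the methods defined
# above. The workspace object and the datastore name "trainingdata" are
# assumptions made for the example, not values from this module.
def _example_datastore_usage(workspace):
    from azureml.core import Datastore

    ds = Datastore.get(workspace, "trainingdata")           # concrete AzureBlob datastore
    ds.upload(src_dir="./data", target_path="data",         # push a local folder to the container
              overwrite=True, show_progress=True)
    ds.download(target_path="./local_copy", prefix="data")  # pull the same prefix back down
    # Hand the data to a pipeline step as a DataReference in download mode.
    return ds.path("data").as_download(path_on_compute="./data")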