Exemple #1
0
    def remove_tags(self, tags=None):
        """Remove the specified keys from tags dictionary of this dataset.

        Keys that are not present in the current tags are ignored.

        :param tags: The list of keys to remove. ``None`` is treated as an empty list.
        :type tags: builtin.list[str]
        :return: The updated dataset object.
        :rtype: typing.Union[azureml.data.TabularDataset, azureml.data.FileDataset]
        :raises UserErrorException: If this dataset is not registered.
        """
        if not self._registration or not self._registration.workspace or not self._registration.registered_id:
            # Raise (not return) so callers never mistake the exception object for a dataset.
            raise UserErrorException(
                'To remove tags from this dataset it must be registered.')
        workspace = self._registration.workspace
        # The signature defaults `tags` to None; normalise so building the set cannot fail.
        keys_to_remove = set(tags) if tags is not None else set()

        def request():
            # Work on a copy: the local tags are only replaced after the service accepts the update.
            updatedTags = deepcopy(self._registration.tags)
            for item in keys_to_remove.intersection(updatedTags):
                del updatedTags[item]

            return _restclient(workspace).dataset.update_dataset(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=self._registration.registered_id,
                new_dataset_dto=_dataset_to_dto(
                    self, self.name, self.description, updatedTags,
                    self._registration.registered_id),
                custom_headers=self._get_telemetry_headers())

        success, result = _make_request(request)
        if not success:
            raise result
        result_dto = _dto_to_dataset(workspace, result)
        # Mirror the tags reported back by the service on the local registration.
        self._registration.tags = result_dto.tags
        return result_dto
Exemple #2
0
    def _ensure_saved_internal(self, workspace):
        """Return the saved id of this dataset, saving it through the service first if needed.

        The service is contacted only when there is no registration info or the
        registration has no saved id yet. On that path the definition and the
        saved id are overwritten with the values from the service response.

        :param workspace: The workspace whose subscription id, resource group and name address the call.
        :return: The saved id held by ``self._registration``.
        """
        if self._registration and self._registration.saved_id:
            # Already saved: nothing to ask the service for.
            return self._registration.saved_id

        def request():
            client = _restclient(workspace)
            return client.dataset.ensure_saved(
                subscription_id=workspace.subscription_id,
                resource_group_name=workspace.resource_group,
                workspace_name=workspace.name,
                dataset=_dataset_to_saved_dataset_dto(self),
                custom_headers=self._get_telemetry_headers())

        ok, response = _make_request(request)
        if not ok:
            raise response
        saved = _saved_dataset_dto_to_dataset(workspace, response)

        # Adopt the definition returned by the service.
        self._definition = saved._definition

        # Keep an existing registration object and only fill in its saved id;
        # otherwise take over the registration built from the response.
        if self._registration:
            self._registration.saved_id = saved._registration.saved_id
        else:
            self._registration = saved._registration

        return self._registration.saved_id
Exemple #3
0
    def register(self,
                 workspace,
                 name,
                 description=None,
                 tags=None,
                 create_new_version=False):
        """Register the dataset to the provided workspace.

        A 409 response, or a 400 response whose message names an existing
        ``name:version`` registration, is translated by ``handle_error`` into a
        ``UserErrorException`` with guidance for the caller.

        :param workspace: The workspace to register the dataset.
        :type workspace: azureml.core.Workspace
        :param name: The name to register the dataset with.
        :type name: str
        :param description: A text description of the dataset. Defaults to None.
        :type description: str
        :param tags: Dictionary of key value tags to give the dataset. Defaults to None.
        :type tags: dict[str, str]
        :param create_new_version: Boolean to register the dataset as a new version under the specified name.
        :type create_new_version: bool
        :return: The registered dataset object.
        :rtype: typing.Union[azureml.data.TabularDataset, azureml.data.FileDataset]
        """
        already_registered_pattern = r'has been registered as (.+):([0-9]+) \(name:version\).'

        def request():
            client = _restclient(workspace)
            # create_new_version drives both "exists ok" and "update definition" flags.
            return client.dataset.register(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_dto=_dataset_to_dto(self, name, description, tags),
                if_exists_ok=create_new_version,
                update_definition_if_exists=create_new_version,
                custom_headers=self._get_telemetry_headers())

        def handle_error(error):
            code = error.response.status_code
            if code == 409:
                # Name conflict: the name is taken and no new version was requested.
                template = (
                    'There is already a dataset registered under name "{}". '
                    'Specify `create_new_version=True` to register the dataset as a new version. '
                    'Use `update`, `add_tags`, or `remove_tags` to change only the description or tags.'
                )
                return UserErrorException(template.format(name))
            if code != 400:
                return None
            found = re.findall(already_registered_pattern, error.message, re.IGNORECASE)
            if len(found) != 1:
                # Not the "identical dataset" message; no translation for this error.
                return None
            existing_name, existing_version = found[0]
            template = (
                'An identical dataset had already been registered, which can '
                'be retrieved with `Dataset.get_by_name(workspace, name="{}", version={})`.'
            )
            return UserErrorException(template.format(existing_name, existing_version))

        ok, response = _make_request(request, handle_error)
        if not ok:
            raise response
        registered = _dto_to_dataset(workspace, response)
        self.__class__._track_output_reference_lineage(registered)
        return registered
Exemple #4
0
    def _get_by_id(workspace, id):
        """Fetch a dataset from the workspace by id.

        The id is first looked up as a saved dataset id among registered
        datasets. If that request fails or does not yield exactly one entry,
        the dataset is fetched directly by id with legacy id resolution
        enabled; a failure of the first request is not raised.

        :param workspace: The workspace whose subscription id, resource group and name address the calls.
        :param id: The dataset id to look up.
        :return: The dataset built from the service response.
        """
        def request_for_registered():
            client = _restclient(workspace)
            return client.dataset.get_datasets_by_saved_dataset_id(
                subscription_id=workspace.subscription_id,
                resource_group_name=workspace.resource_group,
                workspace_name=workspace.name,
                saved_dataset_id=id,
                # just need the 1st (can only be more than one for dataset created in the old age)
                page_size=1,
                custom_headers=_custom_headers)

        def request_for_unregistered():
            client = _restclient(workspace)
            return client.dataset.get_by_id(
                subscription_id=workspace.subscription_id,
                resource_group_name=workspace.resource_group,
                workspace_name=workspace.name,
                id=id,
                resolve_legacy_id=True,
                custom_headers=_custom_headers)

        def handle_error(error):
            if error.response.status_code != 404:
                return None
            message = 'Cannot find dataset with id "{}" in the workspace.'.format(id)
            return UserErrorException(message)

        ok, response = _make_request(request_for_registered)
        if ok and len(response.value) == 1:
            found = _dto_to_dataset(workspace, response.value[0])
        else:
            # No single registered match: fall back to a direct lookup by id.
            ok, response = _make_request(request_for_unregistered, handle_error)
            if not ok:
                raise response
            found = _saved_dataset_dto_to_dataset(workspace, response)

        warn_deprecated_blocks(found)
        return found
Exemple #5
0
    def add_tags(self, tags=None):
        """Add key value pairs to the tags dictionary of this dataset.

        A key that already exists with the same value is accepted; a key that
        already exists with a different value is rejected.

        :param tags: The dictionary of tags to add. ``None`` is treated as an empty dictionary.
        :type tags: dict[str, str]
        :return: The updated dataset object.
        :rtype: typing.Union[azureml.data.TabularDataset, azureml.data.FileDataset]
        :raises UserErrorException: If this dataset is not registered, or if a key in ``tags``
            is already present with a different value.
        """
        if not self._registration or not self._registration.workspace or not self._registration.registered_id:
            # Raise (not return) so callers never mistake the exception object for a dataset.
            raise UserErrorException(
                'To add tags to this dataset it must be registered.')
        workspace = self._registration.workspace
        # The signature defaults `tags` to None; normalise so the set/dict operations cannot fail.
        new_tags = tags if tags is not None else {}

        def request():
            existing_tags = self._registration.tags
            # Only keys whose stored value differs count as conflicts.
            duplicate_keys = [
                item for item in set(new_tags).intersection(existing_tags)
                if existing_tags[item] != new_tags[item]
            ]
            if duplicate_keys:
                raise UserErrorException(
                    ('Dataset already contains different values for tags '
                     'with the following keys {}').format(duplicate_keys))

            # Work on a copy: the local tags are only replaced after the service accepts the update.
            updatedTags = deepcopy(existing_tags)
            updatedTags.update(new_tags)

            return _restclient(workspace).dataset.update_dataset(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=self._registration.registered_id,
                new_dataset_dto=_dataset_to_dto(
                    self, self.name, self.description, updatedTags,
                    self._registration.registered_id),
                custom_headers=self._get_telemetry_headers())

        success, result = _make_request(request)
        if not success:
            raise result
        result_dto = _dto_to_dataset(workspace, result)
        # Mirror the tags reported back by the service on the local registration.
        self._registration.tags = result_dto.tags
        return result_dto
Exemple #6
0
    def update(self, description=None, tags=None):
        """Perform an in-place update of the dataset.

        :param description: The new description to use for the dataset. This description replaces the existing
            description. Defaults to existing description. To clear description, enter empty string.
        :type description: str
        :param tags: A dictionary of tags to update the dataset with. These tags replace existing tags for the
            dataset. Defaults to existing tags. To clear tags, enter empty dictionary.
        :type tags: dict[str, str]
        :return: The updated dataset object.
        :rtype: typing.Union[azureml.data.TabularDataset, azureml.data.FileDataset]
        :raises UserErrorException: If this dataset is not registered.
        """
        if not self._registration or not self._registration.workspace or not self._registration.registered_id:
            # Raise (not return) so callers never mistake the exception object for a dataset.
            raise UserErrorException(
                'To update this dataset it must be registered.')
        workspace = self._registration.workspace

        def request():
            # Compare against None explicitly: an empty string / empty dict is a
            # deliberate "clear" request and must not fall back to the current value.
            updated_description = self._registration.description if description is None else description
            updated_tags = self._registration.tags if tags is None else tags

            return _restclient(workspace).dataset.update_dataset(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=self._registration.registered_id,
                new_dataset_dto=_dataset_to_dto(
                    self, self.name, updated_description, updated_tags,
                    self._registration.registered_id),
                custom_headers=self._get_telemetry_headers())

        success, result = _make_request(request)
        if not success:
            raise result
        result_dto = _dto_to_dataset(workspace, result)
        # Mirror the values reported back by the service on the local registration.
        self._registration.tags = result_dto.tags
        self._registration.description = result_dto.description
        return result_dto
Exemple #7
0
    def _register(self,
                  workspace,
                  name,
                  description=None,
                  tags=None,
                  create_new_version=False):
        """Register this dataset in ``workspace`` under ``name`` and return the resulting dataset.

        A 409 response, or a 400 response whose message names an existing
        ``name:version`` registration, is translated by ``handle_error`` into a
        ``UserErrorException``.

        :param workspace: The workspace whose subscription id, resource group and name address the call.
        :param name: The name to register the dataset with.
        :param description: Description passed on to the dataset DTO. Defaults to None.
        :param tags: Tags passed on to the dataset DTO. Defaults to None.
        :param create_new_version: Sent to the service as both ``if_exists_ok`` and
            ``update_definition_if_exists``.
        :return: The dataset built from the service response.
        """
        def request():
            client = _restclient(workspace)
            return client.dataset.register(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_dto=_dataset_to_dto(self, name, description, tags),
                if_exists_ok=create_new_version,
                update_definition_if_exists=create_new_version,
                custom_headers=self._get_telemetry_headers())

        def handle_error(error):
            code = error.response.status_code
            if code == 409:
                # Name conflict: the name is taken and no new version was requested.
                conflict_message = (
                    'There is already a dataset registered under name "{}". '
                    'Specify `create_new_version=True` to register the dataset as a new version.'
                ).format(name)
                return UserErrorException(conflict_message)
            if code == 400:
                pattern = re.compile(
                    r'has been registered as (.+):([0-9]+) \(name:version\).',
                    re.IGNORECASE)
                found = pattern.findall(error.message)
                if len(found) == 1:
                    existing_name, existing_version = found[0]
                    identical_message = (
                        'An identical dataset had already been registered, which can '
                        'be retrieved with `Dataset.get_by_name(workspace, name="{}", version={})`.'
                    ).format(existing_name, existing_version)
                    return UserErrorException(identical_message)
            # Any other error gets no translation here.
            return None

        ok, response = _make_request(request, handle_error)
        if not ok:
            raise response
        return _dto_to_dataset(workspace, response)
Exemple #8
0
    def unregister_all_versions(self):
        """Unregister every version registered under this dataset's name from the workspace.

        Does nothing when the dataset is not registered. After a successful
        call the local registration info is cleared.

        .. remarks::

            The operation does not change any source data.
        """
        registration = self._registration
        if not (registration and registration.workspace and registration.registered_id):
            # Not registered: nothing to unregister.
            return
        workspace = registration.workspace

        def request():
            client = _restclient(workspace)
            return client.dataset.unregister_dataset(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                self.name,
                custom_headers=_custom_headers)

        ok, response = _make_request(request)
        if not ok:
            raise response
        self._registration = None
Exemple #9
0
    def _get_by_name(workspace, name, version):
        """Retrieve a registered dataset from the workspace by name and version.

        :param workspace: The workspace whose subscription id, resource group and name address the call.
        :param name: The registration name of the dataset.
        :param version: ``'latest'`` or ``None`` to request no specific version (sent to the
            service as ``version_id=None``); otherwise a value convertible with ``int()``.
        :return: The dataset built from the service response.
        :raises UserErrorException: If ``version`` cannot be converted to an integer, or (via
            ``handle_error``) when the service answers 404.
        """
        if version is None or version == 'latest':
            version = None
        else:
            try:
                version = int(version)
            except (TypeError, ValueError, OverflowError):
                # Catch only conversion failures; a bare except would also swallow
                # KeyboardInterrupt/SystemExit.
                raise UserErrorException(
                    'Invalid value {} for version. Version value must be number or "latest".'
                    .format(version))

        def request():
            return _restclient(workspace).dataset.get_dataset_by_name(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_name=name,
                version_id=version,
                custom_headers=_custom_headers)

        def handle_error(error):
            if error.response.status_code == 404:
                # `version` was normalised above, so "latest" is represented by None here;
                # only mention the version when a specific one was requested.
                version_hint = '' if version is None else ' (version: {})'.format(version)
                return UserErrorException(
                    'Cannot find dataset registered with name "{}"{} in the workspace.'
                    .format(name, version_hint))

        success, result = _make_request(request, handle_error)
        if not success:
            raise result
        dataset = _dto_to_dataset(workspace, result)
        warn_deprecated_blocks(dataset)
        return dataset