Example #1
def _validate_has_data(dataflow, error_message):
    ensure_dataflow(dataflow)
    try:
        dataflow.verify_has_data()
    except (dataprep().api.dataflow.DataflowValidationError,
            dataprep().api.errorhandlers.ExecutionError) as e:
        raise DatasetValidationError(error_message + '\n' + e.compliant_message, exception=e)
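
The helper above follows a common wrapping pattern: a low-level dataprep failure is caught and re-raised as a dataset-level error that carries a friendlier message plus the original exception. Below is a minimal self-contained sketch of that pattern; the exception classes and the failing check are made up for illustration and are not part of azureml.

class BackendError(Exception):
    """Stand-in for a low-level dataprep error (hypothetical)."""


class DatasetValidationError(Exception):
    """Domain-level error shown to callers (simplified re-creation)."""

    def __init__(self, message, exception=None):
        super().__init__(message)
        self.inner_exception = exception


def validate_has_data(check, error_message):
    # Run the low-level check and translate its failure into a domain error
    # that keeps both the caller-facing message and the original exception.
    try:
        check()
    except BackendError as e:
        raise DatasetValidationError(error_message + '\n' + str(e), exception=e) from e


def failing_check():
    raise BackendError('no files were found at the provided path')


try:
    validate_has_data(failing_check, 'The dataset has no data.')
except DatasetValidationError as err:
    print(err)
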
Example #2
    def _is_single_file_no_transform(dataset):
        steps = dataset._dataflow._get_steps()

        # if there is more than one step, we are going to naively assume that the resulting number of files is
        # nondeterministic
        if len(steps) > 1:
            return False

        first_step = steps[0]
        argument = first_step.arguments
        try:
            argument = argument.to_pod()
        except AttributeError:
            pass

        from azureml.data._dataset import _get_path_from_step
        original_path = _get_path_from_step(first_step.step_type, argument)
        if not original_path:
            return False

        if http_pattern.match(original_path):
            url = urlparse(original_path)
            original_path = url.path

        temp_column = "Temp Portable Path"
        from azureml.data._dataprep_helper import dataprep
        dataflow = dataset._dataflow.take(1).add_column(
            dataprep().api.functions.get_portable_path(
                dataprep().api.expressions.col("Path")), temp_column, "Path")
        path = dataflow._to_pyrecords()[0][temp_column]

        return path.strip("/").endswith(
            original_path.replace("\\", "/").strip("/"))
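
The final comparison normalizes Windows separators and strips slashes before checking that the portable path computed by dataprep ends with the original path. A tiny self-contained sketch of that suffix check, with made-up paths:

def ends_with_original(portable_path, original_path):
    # Normalize backslashes and strip leading/trailing slashes before
    # comparing suffixes, mirroring the check above.
    return portable_path.strip("/").endswith(
        original_path.replace("\\", "/").strip("/"))


print(ends_with_original("/container/folder/data.csv", "folder\\data.csv"))  # True
print(ends_with_original("/container/folder/data.csv", "other/data.csv"))    # False
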
Example #3
def handle_partition_format(dataflow, partition_format):
    validate_partition_format(partition_format)
    pattern, defined_date_parts, columns = parse_partition_format(
        partition_format)

    RegEx = dataprep().api.functions.RegEx
    col = dataprep().api.expressions.col
    create_datetime = dataprep().api.functions.create_datetime

    dataflow = dataflow.add_column(
        RegEx(pattern).extract_record(col('Path')), _path_record_key, None)

    for i in range(len(columns)):
        column = columns[i]
        if defined_date_parts and i == len(columns) - 1:
            parts = [
                col(_date_part_map[part], col(_path_record_key))
                for part in defined_date_parts
            ]
            exp = create_datetime(*parts)
        else:
            exp = col(column, col(_path_record_key))
        dataflow = dataflow.add_column(exp, column, None)

    dataflow = dataflow.drop_columns(_path_record_key)
    return dataflow
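
handle_partition_format drives this through dataprep expressions (RegEx, col, create_datetime), but the underlying idea is ordinary regex work: turn a partition format such as '/{Department}/{PartitionDate:yyyy/MM/dd}/data.parquet' into a pattern with named groups and assemble a datetime from the captured date parts. A simplified, self-contained sketch using only the standard library; the hard-coded pattern stands in for whatever parse_partition_format would generate and handles only this one format:

import re
from datetime import datetime


def extract_partition_columns(path):
    # Hypothetical, hard-coded equivalent of the pattern generated for
    # '/{Department}/{PartitionDate:yyyy/MM/dd}/data.parquet'.
    pattern = re.compile(
        r'/(?P<Department>[^/]+)/(?P<yyyy>\d{4})/(?P<MM>\d{2})/(?P<dd>\d{2})/data\.parquet$')
    match = pattern.search(path)
    if not match:
        return None
    record = match.groupdict()
    return {
        'Department': record['Department'],
        'PartitionDate': datetime(int(record['yyyy']), int(record['MM']), int(record['dd'])),
    }


print(extract_partition_columns('../Accounts/2019/01/01/data.parquet'))
# {'Department': 'Accounts', 'PartitionDate': datetime.datetime(2019, 1, 1, 0, 0)}
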
Example #4
def _add_portable_path_column(dataflow):
    prefix_path = _find_path_prefix(dataflow)
    portable_path = 'Portable Path'
    get_portable_path = dataprep().api.functions.get_portable_path
    col = dataprep().api.expressions.col
    return dataflow.add_column(get_portable_path(col('Path'), prefix_path),
                               portable_path, 'Path'), portable_path
Example #5
    def __repr__(self):
        """Format the dataset object into a string.

        :return: Return the string representation of the dataset object
        :rtype: str
        """
        content = collections.OrderedDict()
        if is_dataprep_installed():
            steps = self._dataflow._get_steps()
            step_type = steps[0].step_type
            step_arguments = steps[0].arguments

            if hasattr(step_arguments, 'to_pod'):
                step_arguments = step_arguments.to_pod()
            if step_type == 'Microsoft.DPrep.GetDatastoreFilesBlock':
                source = [
                    '(\'{}\', \'{}\')'.format(store['datastoreName'],
                                              store['path'])
                    for store in step_arguments['datastores']
                ]
            elif step_type == 'Microsoft.DPrep.GetFilesBlock':
                source = [
                    details['path']
                    for details in step_arguments['path']['resourceDetails']
                ]
            else:
                source = None

            encoder = dataprep().api.engineapi.typedefinitions.CustomEncoder \
                if hasattr(dataprep().api.engineapi.typedefinitions, 'CustomEncoder') \
                else dataprep().api.engineapi.engine.CustomEncoder
            content['source'] = source
            content['definition'] = [
                _get_step_name(s.step_type) for s in steps
            ]
        else:
            encoder = None
            global _dataprep_missing_for_repr_warned
            if not _dataprep_missing_for_repr_warned:
                _dataprep_missing_for_repr_warned = True
                import logging
                logging.getLogger().warning(
                    get_dataprep_missing_message(
                        'Warning: Cannot load "definition" and "source" for the dataset'
                    ))

        if self._registration is not None:
            content['registration'] = collections.OrderedDict([
                ('id', self.id), ('name', self.name), ('version', self.version)
            ])

            if self.description:
                content['registration']['description'] = self.description
            if self.tags:
                content['registration']['tags'] = self.tags
            content['registration'][
                'workspace'] = self._registration.workspace.__repr__()

        return json.dumps(content, indent=2, cls=encoder)
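
When azureml-dataprep is not installed, __repr__ logs its warning only once per process by flipping the module-level flag _dataprep_missing_for_repr_warned. A minimal self-contained sketch of that warn-once pattern, with made-up names:

import logging

_missing_dependency_warned = False


def warn_missing_dependency_once(message):
    # Log the warning the first time only; later calls are silent.
    global _missing_dependency_warned
    if not _missing_dependency_warned:
        _missing_dependency_warned = True
        logging.getLogger().warning(message)


warn_missing_dependency_once('optional dependency not installed')  # logged
warn_missing_dependency_once('optional dependency not installed')  # suppressed
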
Example #6
def upload(src_path, destination, glob_patterns, overwrite):
    engine_api = dataprep().api.engineapi.api.get_engine_api()
    dest_si = dataprep().api._datastore_helper._to_stream_info_value(
        destination[0], destination[1])
    glob_patterns = glob_patterns or None
    engine_api.upload_directory(
        dataprep().api.engineapi.typedefinitions.UploadDirectoryMessageArguments(
            base_path=src_path,
            destination=dest_si,
            folder_path=src_path,
            glob_patterns=glob_patterns,
            overwrite=overwrite))
Example #7
    def __exit__(self, *exc_details):
        """Upload files for datastore.

        :param exc_details:
        :return:
        """
        from azureml.core.datastore import Datastore
        from azureml.data._dataprep_helper import dataprep

        module_logger.debug("Enter __exit__ function of datastore cmgr")
        for key, value in self._config.items():
            df_config, force_read = self._to_data_reference_config(value)
            if self._is_upload(df_config):
                self._validate_config(df_config, key)
                ds = Datastore(workspace=self._workspace,
                               name=df_config.data_store_name)
                if os.path.isdir(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload dir."
                        )
                        dataprep().api.engineapi.api.get_engine_api().upload_directory(
                            dataprep().api.engineapi.typedefinitions.UploadDirectoryMessageArguments(
                                base_path=df_config.path_on_compute,
                                folder_path=df_config.path_on_compute,
                                destination=dataprep().api._datastore_helper._to_stream_info_value(
                                    ds, df_config.path_on_data_store),
                                force_read=force_read,
                                overwrite=df_config.overwrite,
                                concurrent_task_count=1))
                    else:
                        ds.upload(src_dir=df_config.path_on_compute,
                                  target_path=df_config.path_on_data_store,
                                  overwrite=df_config.overwrite)
                elif os.path.isfile(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload file."
                        )
                        dataprep().api.engineapi.api.get_engine_api().upload_file(
                            dataprep().api.engineapi.typedefinitions.UploadFileMessageArguments(
                                base_path=os.path.dirname(df_config.path_on_compute),
                                local_path=df_config.path_on_compute,
                                destination=dataprep().api._datastore_helper._to_stream_info_value(
                                    ds, df_config.path_on_data_store),
                                force_read=force_read,
                                overwrite=df_config.overwrite))
                    else:
                        ds.upload_files(
                            files=[df_config.path_on_compute],
                            target_path=df_config.path_on_data_store,
                            overwrite=df_config.overwrite)
        module_logger.debug("Exit __exit__ function of datastore cmgr")
Example #8
    def read_delimited_files(
            self,
            include_path=False,
            separator=',',
            header=PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS,
            partition_format=None,
            path_glob=None,
            set_column_types=None):
        """Transform the output dataset to a tabular dataset by reading all the output as delimited files.

        :param include_path: Boolean indicating whether to keep path information as a column in the dataset.
            Defaults to False. This is useful when reading multiple files and you want to know which file a
            particular record originated from, or to keep useful information in the file path.
        :type include_path: bool
        :param separator: The separator used to split columns.
        :type separator: str
        :param header: Controls how column headers are promoted when reading from files. Defaults to assuming
            that all files have the same header.
        :type header: azureml.data.dataset_type_definitions.PromoteHeadersBehavior
        :param partition_format: Specify the partition format of the path. Defaults to None.
            The partition information of each path will be extracted into columns based on the specified format.
            The format part '{column_name}' creates a string column, and '{column_name:yyyy/MM/dd/HH/mm/ss}' creates
            a datetime column, where 'yyyy', 'MM', 'dd', 'HH', 'mm' and 'ss' are used to extract the year, month, day,
            hour, minute and second for the datetime type. The format should start at the position of the first
            partition key and continue to the end of the file path.
            For example, given the path '../Accounts/2019/01/01/data.parquet' where the partition is by
            department name and time, partition_format='/{Department}/{PartitionDate:yyyy/MM/dd}/data.parquet'
            creates a string column 'Department' with the value 'Accounts' and a datetime column 'PartitionDate'
            with the value '2019-01-01'.
        :type partition_format: str
        :param path_glob: A glob pattern to filter files that will be read as delimited files. If set to None, then
            all files will be read as delimited files.
        :type path_glob: str
        :param set_column_types: A dictionary to set column data type, where key is column name and value is
            :class:`azureml.data.DataType`. Columns not in the dictionary will remain of type string. Passing None
            will result in no conversions. Entries for columns not found in the source data will not cause an error
            and will be ignored.
        :type set_column_types: dict[str, azureml.data.DataType]
        :return: A :class:`azureml.data.output_dataset_config.OutputTabularDatasetConfig` instance with instructions on
            how to convert the output into a TabularDataset.
        :rtype: azureml.data.output_dataset_config.OutputTabularDatasetConfig
        """
        dprep = dataprep()
        dataflow = dprep.Dataflow(self._engine_api)
        dataflow = TransformationMixin._filter_path(dprep, dataflow, path_glob)
        dataflow = dataflow.parse_delimited(separator=separator,
                                            headers_mode=header,
                                            encoding=dprep.FileEncoding.UTF8,
                                            quoting=False,
                                            skip_rows=0,
                                            skip_mode=dprep.SkipMode.NONE,
                                            comment=None)
        if partition_format:
            dataflow = handle_partition_format(dataflow, partition_format)
        dataflow = TransformationMixin._handle_path(dataflow, include_path)
        dataflow = _set_column_types(dataflow, set_column_types)
        return self._from_dataflow(dataflow)
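
A usage sketch based on the signature above, assuming azureml-core is installed; OutputFileDatasetConfig is the class this method is defined on, while the name, partition format and glob pattern are made up:

# Usage sketch: names, partition format and glob pattern are made up, and this
# assumes the config is consumed inside an Azure ML pipeline step.
from azureml.data import OutputFileDatasetConfig

output = OutputFileDatasetConfig(name='processed_data')
tabular_output = output.read_delimited_files(
    separator=',',
    partition_format='/{Department}/{PartitionDate:yyyy/MM/dd}/data.csv',
    path_glob='**/*.csv')
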
Example #9
def _dataflow(self):
    if self._definition is None:
        raise UserErrorException(
            'Dataset definition is missing. Please check how the dataset is created.')
    if self._registration and self._registration.workspace:
        dataprep().api._datastore_helper._set_auth_type(
            self._registration.workspace)
    if not isinstance(self._definition, dataprep().Dataflow):
        try:
            self._definition = dataprep().Dataflow.from_json(self._definition)
        except Exception as e:
            msg = 'Failed to load dataset definition with azureml-dataprep=={}'.format(
                dataprep().__version__)
            _get_logger().error('{}. Exception: {}'.format(msg, e))
            raise UserErrorException(
                '{}. Please install the latest version with "pip install -U '
                'azureml-dataprep".'.format(msg))
    return self._definition
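
The property parses the stored JSON definition on first access and caches the parsed Dataflow back onto self._definition, so later accesses skip the deserialization. A minimal self-contained sketch of that parse-once-and-cache shape, with json standing in for the dataprep Dataflow:

import json


class LazyDefinition:
    def __init__(self, definition_json):
        self._definition = definition_json

    @property
    def definition(self):
        # Parse the stored JSON on first access and cache the result,
        # so later accesses return the already-parsed object.
        if isinstance(self._definition, str):
            self._definition = json.loads(self._definition)
        return self._definition


d = LazyDefinition('{"blocks": []}')
print(d.definition)  # parsed on first access
print(d.definition)  # served from the cached object
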
Example #10
    def get_profile(self, workspace=None):
        """Get data profile from the latest profile run submitted for this or the same dataset in the workspace.

        :param workspace: The workspace where the profile run was submitted. Defaults to the workspace of this dataset.
            Required if the dataset is not associated with a workspace.
            See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
            for more information on workspaces.
        :type workspace: azureml.core.Workspace
        :return: The profile result from the latest profile run, of type DatasetProfile.
        :rtype: azureml.data.dataset_profile.DatasetProfile
        """
        workspace = self._ensure_workspace(workspace)
        saved_dataset_id = self._ensure_saved(workspace)

        # The arguments {'generate_preview': 'True', 'row_count': '1000'} are added to ensure
        # that the requestHash is the same. The GenerateProfileWithPreview API adds these arguments
        # on the service side. If anything changes there, this should be changed as well.
        from azureml._restclient.models import ActionRequestDto
        request_dto = ActionRequestDto(action_type=_ACTION_TYPE_PROFILE,
                                       saved_dataset_id=saved_dataset_id,
                                       arguments={
                                           'generate_preview': 'True',
                                           'row_count': '1000'
                                       })

        action_result_dto = _restclient(workspace).dataset.get_action_result(
            workspace.subscription_id,
            workspace.resource_group,
            workspace.name,
            dataset_id=_LEGACY_DATASET_ID,
            request=request_dto,
            custom_headers=_custom_headers)
        result_artifact_ids = action_result_dto.result_artifact_ids
        if result_artifact_ids is None or len(result_artifact_ids) == 0:
            raise AzureMLException(
                'Unable to fetch profile results. Please submit a new profile run.'
            )
        result_artifact = result_artifact_ids[0]
        from azureml._restclient.artifacts_client import ArtifactsClient
        content = ArtifactsClient(
            workspace.service_context).download_artifact_contents_to_string(
                *result_artifact.split("/", 2))
        try:
            from azureml.data.dataset_profile import DatasetProfile
            profile = DatasetProfile(
                saved_dataset_id, action_result_dto.run_id,
                action_result_dto.experiment_name, workspace,
                dataprep().DataProfile._from_json(content))
        except Exception:
            errormsg = 'Unable to fetch profile since profile result is corrupted. Please submit a new profile run.'
            _get_logger().error(errormsg)
            raise AzureMLException(errormsg)

        return profile
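
A usage sketch for the method above, assuming an authenticated workspace loaded from a local config file; the dataset name is made up:

# Usage sketch: 'ws' must be an authenticated azureml.core.Workspace
# (loaded here from a local config file) and the dataset name is made up.
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name='sales-data')
profile = dataset.get_profile()  # result of the latest submitted profile run
print(profile)
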
Example #11
    def download(self, target_path=None, overwrite=False):
        """Download file streams defined by the dataset as local files.

        .. remarks::

            If target_path starts with a /, then it will be treated as an absolute path. If it doesn't start
            with a /, then it will be treated as a path relative to the current working directory.

        :param target_path: The local directory to download the files to. If None, the data will be downloaded
            into a temporary directory.
        :type target_path: str
        :param overwrite: Indicates whether to overwrite existing files. The default is False. Existing files will
            be overwritten if overwrite is set to True; otherwise an exception will be raised.
        :type overwrite: bool
        :return: Returns an array of file paths for each file downloaded.
        :rtype: builtin.list(str)
        """
        with _get_tracer().start_as_current_span(
                'download', user_facing_name='Dataset.download') as span:
            target_path = _ensure_path(target_path)
            download_list = [
                os.path.abspath(os.path.join(target_path, '.' + p))
                for p in self._to_path(activity='download.to_path')
            ]

            if self.id:
                span.set_user_facing_attribute('dataset_id', self.id)
            span.set_user_facing_attribute('target_path', target_path)

            if not overwrite:
                for p in download_list:
                    # encode p to avoid UnicodeEncodeError from os.path.exists
                    if os.path.exists(_encode_if_needed(p)):
                        raise UserErrorException(
                            'File "{}" already exists. Set overwrite=True to overwrite it.'
                            .format(p))
            base_path = dataprep().api.datasources.LocalFileOutput(target_path)

            dataflow, portable_path = _add_portable_path_column(self._dataflow)
            dataflow = dataflow.write_streams(streams_column='Path',
                                              base_path=base_path,
                                              file_names_column=portable_path)

            dataflow = get_dataflow_for_execution(dataflow, 'download',
                                                  'FileDataset')
            _try_execute(
                dataflow.run_local, 'download', None if self.id is None else {
                    'id': self.id,
                    'name': self.name,
                    'version': self.version
                })
            return download_list
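
A usage sketch for download as defined above, assuming an authenticated workspace; the dataset name and target directory are made up:

# Usage sketch: the dataset name and target directory are made up, and 'ws'
# must be an authenticated azureml.core.Workspace.
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
file_dataset = Dataset.get_by_name(ws, name='raw-images')
downloaded_paths = file_dataset.download(target_path='./data', overwrite=True)
print(downloaded_paths[:5])  # absolute local paths of the downloaded files
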
Example #12
def _profile_from_action(workspace, result):
    result_artifact_ids = result.result_artifact_ids
    if result_artifact_ids is None or len(result_artifact_ids) == 0:
        return (None, None)
    result_artifact = result_artifact_ids[0]
    content = ArtifactsClient(
        workspace.service_context).download_artifact_contents_to_string(
            *result_artifact.split("/", 2))
    try:
        profile = dataprep().DataProfile._from_json(content)
    except Exception:
        raise RuntimeError('Profile result is corrupted.')
    if hasattr(result, 'is_up_to_date_error') and result.is_up_to_date_error:
        raise RuntimeError(result.is_up_to_date_error)
    if hasattr(result, 'is_up_to_date'):
        return (profile, result.is_up_to_date)
    return (profile, None)
Example #13
def _engine_api(self):
    return dataprep().api.engineapi.api.get_engine_api()
Example #14
def _validate_timestamp_columns(self, columns_list):
    FieldType = dataprep().api.engineapi.typedefinitions.FieldType
    columns = list(filter(lambda col: col is not None, columns_list))
    _validate_has_columns(self._dataflow, columns,
                          [FieldType.DATE for c in columns])