def _validate_has_data(dataflow, error_message):
    """Check that a dataflow has data and translate failures into a dataset error.

    The dataflow is first passed through ``ensure_dataflow`` and then asked to
    verify that it has data.

    :param dataflow: The dataflow to verify.
    :param error_message: Text placed on the first line of the raised error,
        followed by the compliant message of the underlying failure.
    :type error_message: str
    :raises DatasetValidationError: If ``verify_has_data`` raises a dataprep
        ``DataflowValidationError`` or ``ExecutionError``.
    """
    ensure_dataflow(dataflow)
    try:
        dataflow.verify_has_data()
    except (dataprep().api.dataflow.DataflowValidationError,
            dataprep().api.errorhandlers.ExecutionError) as error:
        # Keep the caller's message first, then the engine's compliant message.
        message = '\n'.join([error_message, error.compliant_message])
        raise DatasetValidationError(message, exception=error)
def _is_single_file_no_transform(dataset): steps = dataset._dataflow._get_steps() # if there is more than one step, we are going to naively assume that the resulting number of files is # nondeterministic if len(steps) > 1: return False first_step = steps[0] argument = first_step.arguments try: argument = argument.to_pod() except AttributeError: pass from azureml.data._dataset import _get_path_from_step original_path = _get_path_from_step(first_step.step_type, argument) if not original_path: return False if http_pattern.match(original_path): url = urlparse(original_path) original_path = url.path temp_column = "Temp Portable Path" from azureml.data._dataprep_helper import dataprep dataflow = dataset._dataflow.take(1).add_column( dataprep().api.functions.get_portable_path( dataprep().api.expressions.col("Path")), temp_column, "Path") path = dataflow._to_pyrecords()[0][temp_column] return path.strip("/").endswith( original_path.replace("\\", "/").strip("/"))
def handle_partition_format(dataflow, partition_format):
    """Add one column per partition key extracted from the 'Path' column.

    The partition format is validated and parsed into a regular expression, the
    date parts it defines, and the list of columns to create. The expression is
    applied to 'Path' into a temporary record column, each target column is
    read from that record, and the temporary column is dropped again.

    :param dataflow: The dataflow to extend.
    :param partition_format: The partition format string to apply.
    :type partition_format: str
    :return: The dataflow with the partition columns added.
    """
    validate_partition_format(partition_format)
    pattern, defined_date_parts, columns = parse_partition_format(partition_format)

    RegEx = dataprep().api.functions.RegEx
    col = dataprep().api.expressions.col
    create_datetime = dataprep().api.functions.create_datetime

    def _from_record(field):
        # Expression reading ``field`` out of the temporary path record column.
        return col(field, col(_path_record_key))

    dataflow = dataflow.add_column(
        RegEx(pattern).extract_record(col('Path')), _path_record_key, None)

    last_index = len(columns) - 1
    for index, column in enumerate(columns):
        if defined_date_parts and index == last_index:
            # When date parts are defined, the last column is assembled from them.
            date_parts = [_from_record(_date_part_map[part])
                          for part in defined_date_parts]
            expression = create_datetime(*date_parts)
        else:
            expression = _from_record(column)
        dataflow = dataflow.add_column(expression, column, None)

    return dataflow.drop_columns(_path_record_key)
def _add_portable_path_column(dataflow):
    """Append a 'Portable Path' column derived from the 'Path' column.

    The portable path is computed with dataprep's ``get_portable_path`` using
    the prefix returned by ``_find_path_prefix`` for this dataflow.

    :param dataflow: The dataflow to extend.
    :return: A tuple of the extended dataflow and the name of the new column.
    :rtype: tuple
    """
    column_name = 'Portable Path'
    common_prefix = _find_path_prefix(dataflow)

    to_portable_path = dataprep().api.functions.get_portable_path
    column = dataprep().api.expressions.col

    expression = to_portable_path(column('Path'), common_prefix)
    extended = dataflow.add_column(expression, column_name, 'Path')
    return extended, column_name
def __repr__(self):
    """Format the dataset object into a string.

    When dataprep is installed the output contains the data source and the
    list of step names of the definition. When it is not, a warning is logged
    once and those two entries are omitted. Registration details are included
    whenever the dataset is registered.

    :return: JSON string representation of the dataset object.
    :rtype: str
    """
    global _dataprep_missing_for_repr_warned

    content = collections.OrderedDict()
    encoder = None

    if not is_dataprep_installed():
        # Only warn the first time a repr is requested without dataprep.
        if not _dataprep_missing_for_repr_warned:
            _dataprep_missing_for_repr_warned = True
            import logging
            logging.getLogger().warning(
                get_dataprep_missing_message(
                    'Warning: Cannot load "definition" and "source" for the dataset'
                ))
    else:
        steps = self._dataflow._get_steps()
        first_step = steps[0]
        step_type = first_step.step_type
        step_arguments = first_step.arguments
        if hasattr(step_arguments, 'to_pod'):
            step_arguments = step_arguments.to_pod()

        # The source is only reported for the two known file-listing steps.
        source = None
        if step_type == 'Microsoft.DPrep.GetDatastoreFilesBlock':
            source = []
            for store in step_arguments['datastores']:
                source.append("('{}', '{}')".format(
                    store['datastoreName'], store['path']))
        elif step_type == 'Microsoft.DPrep.GetFilesBlock':
            source = [
                details['path']
                for details in step_arguments['path']['resourceDetails']
            ]

        # CustomEncoder is looked up in typedefinitions first, then in engine.
        typedefinitions = dataprep().api.engineapi.typedefinitions
        if hasattr(typedefinitions, 'CustomEncoder'):
            encoder = typedefinitions.CustomEncoder
        else:
            encoder = dataprep().api.engineapi.engine.CustomEncoder

        content['source'] = source
        content['definition'] = [_get_step_name(s.step_type) for s in steps]

    if self._registration is not None:
        registration = collections.OrderedDict()
        registration['id'] = self.id
        registration['name'] = self.name
        registration['version'] = self.version
        content['registration'] = registration
        if self.description:
            registration['description'] = self.description
        if self.tags:
            registration['tags'] = self.tags
        registration['workspace'] = self._registration.workspace.__repr__()

    return json.dumps(content, indent=2, cls=encoder)
def upload(src_path, destination, glob_patterns, overwrite):
    """Upload a local directory through the dataprep engine.

    :param src_path: Local directory to upload; used as both the base path and
        the folder path of the upload request.
    :param destination: Indexable whose first two items are converted into the
        destination stream info via ``_to_stream_info_value``.
    :param glob_patterns: Glob patterns for the upload; any falsy value is sent
        as None.
    :param overwrite: Overwrite flag forwarded to the engine.
    """
    engine_api = dataprep().api.engineapi.api.get_engine_api()
    dest_si = dataprep().api._datastore_helper._to_stream_info_value(
        destination[0], destination[1])

    # Normalize empty pattern collections to None.
    if not glob_patterns:
        glob_patterns = None

    message_type = dataprep().api.engineapi.typedefinitions.UploadDirectoryMessageArguments
    message = message_type(base_path=src_path,
                           destination=dest_si,
                           folder_path=src_path,
                           glob_patterns=glob_patterns,
                           overwrite=overwrite)
    engine_api.upload_directory(message)
def __exit__(self, *exc_details):
    """Upload the configured local paths to their datastores on context exit.

    Every configuration entry that is marked for upload is validated and then
    uploaded. Directories and single files are handled separately, and
    Azure Data Lake Gen1 datastores are uploaded through the dataprep engine
    instead of the datastore's own upload methods. Paths that are neither an
    existing directory nor an existing file are skipped.

    :param exc_details: Exception details supplied by the ``with`` statement; unused.
    :return: None
    """
    from azureml.core.datastore import Datastore
    from azureml.data._dataprep_helper import dataprep

    module_logger.debug("Enter __exit__ function of datastore cmgr")

    for key, value in self._config.items():
        df_config, force_read = self._to_data_reference_config(value)
        if not self._is_upload(df_config):
            continue

        self._validate_config(df_config, key)
        ds = Datastore(workspace=self._workspace,
                       name=df_config.data_store_name)

        if os.path.isdir(df_config.path_on_compute):
            if not self._is_datastore_adlsgen1(ds):
                ds.upload(src_dir=df_config.path_on_compute,
                          target_path=df_config.path_on_data_store,
                          overwrite=df_config.overwrite)
            else:
                module_logger.debug(
                    "AzureDataLake Gen1 used as Datastore for upload dir."
                )
                engine_api = dataprep().api.engineapi.api.get_engine_api()
                directory_message = dataprep().api.engineapi.typedefinitions.UploadDirectoryMessageArguments(
                    base_path=df_config.path_on_compute,
                    folder_path=df_config.path_on_compute,
                    destination=dataprep().api._datastore_helper._to_stream_info_value(
                        ds, df_config.path_on_data_store),
                    force_read=force_read,
                    overwrite=df_config.overwrite,
                    concurrent_task_count=1)
                engine_api.upload_directory(directory_message)
        elif os.path.isfile(df_config.path_on_compute):
            if not self._is_datastore_adlsgen1(ds):
                ds.upload_files(files=[df_config.path_on_compute],
                                target_path=df_config.path_on_data_store,
                                overwrite=df_config.overwrite)
            else:
                module_logger.debug(
                    "AzureDataLake Gen1 used as Datastore for upload file."
                )
                engine_api = dataprep().api.engineapi.api.get_engine_api()
                file_message = dataprep().api.engineapi.typedefinitions.UploadFileMessageArguments(
                    base_path=os.path.dirname(df_config.path_on_compute),
                    local_path=df_config.path_on_compute,
                    destination=dataprep().api._datastore_helper._to_stream_info_value(
                        ds, df_config.path_on_data_store),
                    force_read=force_read,
                    overwrite=df_config.overwrite)
                engine_api.upload_file(file_message)

    module_logger.debug("Exit __exit__ function of datastore cmgr")
def read_delimited_files(
        self,
        include_path=False,
        separator=',',
        header=PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS,
        partition_format=None,
        path_glob=None,
        set_column_types=None):
    """Transform the output dataset to a tabular dataset by reading all the output as delimited files.

    :param include_path: Boolean to keep path information as a column in the dataset. Defaults to False.
        This is useful when reading multiple files and you want to know which file a particular record
        originated from, or to keep useful information that is stored in the file path.
    :type include_path: bool
    :param separator: The separator used to split columns.
    :type separator: str
    :param header: Controls how column headers are promoted when reading from files. Defaults to assuming
        that all files have the same header.
    :type header: azureml.data.dataset_type_definitions.PromoteHeadersBehavior
    :param partition_format: Specify the partition format of path. Defaults to None.
        The partition information of each path will be extracted into columns based on the specified
        format. Format part '{column_name}' creates a string column, and
        '{column_name:yyyy/MM/dd/HH/mm/ss}' creates a datetime column, where 'yyyy', 'MM', 'dd', 'HH', 'mm'
        and 'ss' are used to extract year, month, day, hour, minute and second for the datetime type.
        The format should start from the position of the first partition key and run until the end of the
        file path. For example, given the path '../Accounts/2019/01/01/data.csv' where the partition is by
        department name and time, partition_format='/{Department}/{PartitionDate:yyyy/MM/dd}/data.csv'
        creates a string column 'Department' with the value 'Accounts' and a datetime column
        'PartitionDate' with the value '2019-01-01'.
    :type partition_format: str
    :param path_glob: A glob pattern to filter files that will be read as delimited files. If set to None,
        then all files will be read as delimited files.
    :type path_glob: str
    :param set_column_types: A dictionary to set column data type, where key is column name and value is
        :class:`azureml.data.DataType`. Columns not in the dictionary will remain of type string. Passing
        None will result in no conversions. Entries for columns not found in the source data will not
        cause an error and will be ignored.
    :type set_column_types: dict[str, azureml.data.DataType]
    :return: A :class:`azureml.data.output_dataset_config.OutputTabularDatasetConfig` instance with
        instruction of how to convert the output into a TabularDataset.
    :rtype: azureml.data.output_dataset_config.OutputTabularDatasetConfig
    """
    dprep = dataprep()

    # Start from an empty dataflow and restrict it to the requested files.
    source_flow = dprep.Dataflow(self._engine_api)
    filtered_flow = TransformationMixin._filter_path(dprep, source_flow, path_glob)

    parsed_flow = filtered_flow.parse_delimited(
        separator=separator,
        headers_mode=header,
        encoding=dprep.FileEncoding.UTF8,
        quoting=False,
        skip_rows=0,
        skip_mode=dprep.SkipMode.NONE,
        comment=None)

    # Partition columns are extracted before the path column is handled.
    if partition_format:
        parsed_flow = handle_partition_format(parsed_flow, partition_format)

    with_path_handled = TransformationMixin._handle_path(parsed_flow, include_path)
    typed_flow = _set_column_types(with_path_handled, set_column_types)
    return self._from_dataflow(typed_flow)
def _dataflow(self):
    """Return the dataset definition as a dataprep Dataflow.

    If the stored definition is not yet a Dataflow it is loaded with
    ``Dataflow.from_json`` and the loaded object replaces ``self._definition``.
    When the dataset is registered to a workspace, the datastore helper's auth
    type is set from that workspace first.

    :return: The dataset definition as a Dataflow.
    :raises UserErrorException: If the definition is missing or cannot be
        loaded with the installed azureml-dataprep version.
    """
    if self._definition is None:
        raise UserErrorException(
            'Dataset definition is missing. Please check how the dataset is created.'
        )

    if self._registration and self._registration.workspace:
        dataprep().api._datastore_helper._set_auth_type(
            self._registration.workspace)

    # Already loaded: nothing to convert.
    if isinstance(self._definition, dataprep().Dataflow):
        return self._definition

    try:
        self._definition = dataprep().Dataflow.from_json(self._definition)
    except Exception as error:
        msg = 'Failed to load dataset definition with azureml-dataprep=={}'.format(
            dataprep().__version__)
        _get_logger().error('{}. Exception: {}'.format(msg, error))
        raise UserErrorException(
            '{}. Please install the latest version with "pip install -U azureml-dataprep".'
            .format(msg))

    return self._definition
def get_profile(self, workspace=None):
    """Get data profile from the latest profile run submitted for this or the same dataset in the workspace.

    :param workspace: The workspace where profile run was submitted. Defaults to the workspace of this
        dataset. Required if dataset is not associated to a workspace.
        See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
        for more information on workspaces.
    :type workspace: azureml.core.Workspace
    :return: Profile result from the latest profile run of type DatasetProfile.
    :rtype: azureml.data.dataset_profile.DatasetProfile
    :raises AzureMLException: If no profile result artifact is available, or if the downloaded profile
        result cannot be turned into a DatasetProfile.
    """
    workspace = self._ensure_workspace(workspace)
    saved_dataset_id = self._ensure_saved(workspace)

    # arguments [{'generate_preview': 'True', 'row_count': '1000'}] are added to ensure
    # that requestHash is same. The GenerateProfileWithPreview API add these arguments on service side.
    # If any changes are made there, this should also be changed.
    from azureml._restclient.models import ActionRequestDto
    profile_arguments = {'generate_preview': 'True', 'row_count': '1000'}
    request_dto = ActionRequestDto(action_type=_ACTION_TYPE_PROFILE,
                                   saved_dataset_id=saved_dataset_id,
                                   arguments=profile_arguments)

    dataset_client = _restclient(workspace).dataset
    action_result_dto = dataset_client.get_action_result(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_id=_LEGACY_DATASET_ID,
        request=request_dto,
        custom_headers=_custom_headers)

    result_artifact_ids = action_result_dto.result_artifact_ids
    if not result_artifact_ids:
        raise AzureMLException(
            'Unable to fetch profile results. Please submit a new profile run.'
        )

    # The first artifact id is split into at most three '/'-separated parts
    # that are passed positionally to the artifacts client.
    artifact_parts = result_artifact_ids[0].split("/", 2)
    from azureml._restclient.artifacts_client import ArtifactsClient
    artifacts_client = ArtifactsClient(workspace.service_context)
    content = artifacts_client.download_artifact_contents_to_string(*artifact_parts)

    try:
        from azureml.data.dataset_profile import DatasetProfile
        profile = DatasetProfile(saved_dataset_id,
                                 action_result_dto.run_id,
                                 action_result_dto.experiment_name,
                                 workspace,
                                 dataprep().DataProfile._from_json(content))
    except Exception:
        errormsg = 'Unable to fetch profile since profile result is corrupted. Please submit a new profile run.'
        _get_logger().error(errormsg)
        raise AzureMLException(errormsg)

    return profile
def download(self, target_path=None, overwrite=False):
    """Download file streams defined by the dataset as local files.

    .. remarks::

        If target_path starts with a /, then it will be treated as an absolute path. If it doesn't start
        with a /, then it will be treated as a relative path relative to the current working directory.

    :param target_path: The local directory to download the files to. If None, the data will be downloaded
        into a temporary directory.
    :type target_path: str
    :param overwrite: Indicates whether to overwrite existing files. The default is False. Existing files
        will be overwritten if overwrite is set to True; otherwise an exception will be raised.
    :type overwrite: bool
    :return: Returns an array of file paths for each file downloaded.
    :rtype: builtin.list(str)
    :raises UserErrorException: If overwrite is False and one of the target files already exists.
    """
    tracer = _get_tracer()
    with tracer.start_as_current_span(
            'download', user_facing_name='Dataset.download') as span:
        target_path = _ensure_path(target_path)

        # Each dataset path is prefixed with '.' before joining so that it is
        # resolved underneath target_path.
        download_list = []
        for relative_path in self._to_path(activity='download.to_path'):
            local_file = os.path.join(target_path, '.' + relative_path)
            download_list.append(os.path.abspath(local_file))

        if self.id:
            span.set_user_facing_attribute('dataset_id', self.id)
        span.set_user_facing_attribute('target_path', target_path)

        if not overwrite:
            # encode the path to avoid UnicodeEncodeError from os.path.exists
            existing = next(
                (p for p in download_list
                 if os.path.exists(_encode_if_needed(p))),
                None)
            if existing is not None:
                raise UserErrorException(
                    'File "{}" already exists. Set overwrite=True to overwrite it.'
                    .format(existing))

        base_path = dataprep().api.datasources.LocalFileOutput(target_path)
        dataflow, portable_path = _add_portable_path_column(self._dataflow)
        dataflow = dataflow.write_streams(streams_column='Path',
                                          base_path=base_path,
                                          file_names_column=portable_path)
        dataflow = get_dataflow_for_execution(dataflow, 'download',
                                              'FileDataset')

        run_local = dataflow.run_local
        if self.id is None:
            dataset_info = None
        else:
            dataset_info = {
                'id': self.id,
                'name': self.name,
                'version': self.version
            }
        _try_execute(run_local, 'download', dataset_info)
        return download_list
def _profile_from_action(workspace, result): result_artifact_ids = result.result_artifact_ids if result_artifact_ids is None or len(result_artifact_ids) == 0: return (None, None) result_artifact = result_artifact_ids[0] content = ArtifactsClient( workspace.service_context).download_artifact_contents_to_string( *result_artifact.split("/", 2)) try: profile = dataprep().DataProfile._from_json(content) except: raise RuntimeError('Profile result is corrupted.') if hasattr(result, 'is_up_to_date_error') and result.is_up_to_date_error: raise RuntimeError(result.is_up_to_date_error) if hasattr(result, 'is_up_to_date'): return (profile, result.is_up_to_date) return (profile, None)
def _engine_api(self):
    """Return the dataprep engine API obtained from ``get_engine_api``."""
    engine_api_module = dataprep().api.engineapi.api
    return engine_api_module.get_engine_api()
def _validate_timestamp_columns(self, columns_list):
    """Validate that the given columns exist in the dataflow as DATE columns.

    None entries in ``columns_list`` are ignored; the remaining columns are
    passed to ``_validate_has_columns`` together with one ``FieldType.DATE``
    per column.

    :param columns_list: Column names to validate; entries may be None.
    """
    FieldType = dataprep().api.engineapi.typedefinitions.FieldType
    columns = [name for name in columns_list if name is not None]
    expected_types = [FieldType.DATE] * len(columns)
    _validate_has_columns(self._dataflow, columns, expected_types)