Example #1
    def time_between(self,
                     start_time,
                     end_time,
                     include_boundary=True,
                     validate=True):
        """Filter TabularDataset between a specified start and end time.

        :param start_time: The lower bound for filtering data.
        :type start_time: datetime.datetime
        :param end_time: The upper bound for filtering data.
        :type end_time: datetime.datetime
        :param include_boundary: Indicate if the row associated with the boundary time (``start_time`` and
            ``end_time``) should be included.
        :type include_boundary: bool
        :param validate: Indicates whether to validate if specified columns exist in the dataset. The default is True.
            Validation requires that the data source is accessible from the current compute.
        :type validate: bool
        :return: A TabularDataset with the new filtered dataset.
        :rtype: azureml.data.TabularDataset
        """
        if self._registration and self._registration.workspace:
            collect_datasets_usage(_get_logger(), _TIMESERIES_BETWEEN_ACTIVITY,
                                   [self], self._registration.workspace, "N/A")

        return self._time_filter(self.time_between.__name__,
                                 lower_bound=start_time,
                                 upper_bound=end_time,
                                 include_boundary=include_boundary,
                                 validate=validate)
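A minimal usage sketch for time_between, assuming a hypothetical registered time-series dataset named 'weather' that already has a timestamp column defined via with_timestamp_columns; the dataset name and workspace configuration are illustrative:

from datetime import datetime
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name='weather')  # hypothetical registered time-series dataset

# Keep only rows whose timestamp falls between the two boundaries (inclusive by default).
january = dataset.time_between(start_time=datetime(2020, 1, 1),
                               end_time=datetime(2020, 1, 31))
df = january.to_pandas_dataframe()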
Example #2
    def get_partition_key_values(self, partition_keys=None):
        """Return unique key values of partition_keys.

        Validates that partition_keys is a valid subset of the full set of partition keys and returns the unique
        key values of partition_keys. If partition_keys is None, the default is to return the unique key
        combinations for the full set of partition keys of this dataset.

        .. code-block:: python

            # get all partition key value pairs
            partitions = ds.get_partition_key_values()
            # Return [{'country': 'US', 'state': 'WA', 'partition_date': datetime('2020-1-1')}]

            partitions = ds.get_partition_key_values(['country'])
            # Return [{'country': 'US'}]

        :param partition_keys: Optional, the partition keys to return unique values for. Defaults to the full set
            of partition keys of this dataset.
        :type partition_keys: builtin.list[str]
        :return: A list of dictionaries, each mapping partition key names to their values.
        :rtype: builtin.list[dict]
        """
        import time
        starting_time = time.process_time()

        if not self.partition_keys or len(self.partition_keys) == 0:
            raise UserErrorException(
                "get_partition_key_values is not available to a dataset that has no "
                "partition keys")

        if not partition_keys:
            partition_keys = self.partition_keys

        invalid_keys = []
        for key in partition_keys:
            if key not in self.partition_keys:
                invalid_keys.append(key)
        if len(invalid_keys) != 0:
            raise UserErrorException(
                "{0} are invalid partition keys".format(invalid_keys))

        dataflow = self._dataflow.keep_columns(partition_keys)
        # Iterate over a copy so that removing a step does not skip the following one.
        for step in list(dataflow._steps):
            if step.step_type == 'Microsoft.DPrep.ReadParquetFileBlock' or \
                    step.step_type == 'Microsoft.DPrep.ParseDelimitedBlock' or \
                    step.step_type == 'Microsoft.DPrep.ParseJsonLinesBlock':
                dataflow._steps.remove(step)
        dataflow = dataflow.distinct_rows()
        df = dataflow.to_pandas_dataframe()
        partition_key_values = df[partition_keys].to_dict(
            orient='records') if df.shape[0] != 0 else []

        if self._registration and self._registration.workspace:
            collect_datasets_usage(
                _get_logger(), _PATITION_KEY_VALUES_ACTIVITY, [self],
                self._registration.workspace, "{}", {
                    "execution_time": time.process_time() - starting_time,
                    "number_of_partition_keys": len(partition_keys)
                })
        return partition_key_values
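The docstring example above assumes the dataset already has partition keys. A minimal sketch of where those keys come from, under the assumption of a hypothetical datastore layout data/{country}/{state}/*.csv; the partition_format argument is what populates partition_keys:

from azureml.core import Workspace, Datastore, Dataset

ws = Workspace.from_config()
datastore = Datastore.get(ws, 'workspaceblobstore')

# Hypothetical layout: data/US/WA/part-0.csv, data/US/CA/part-0.csv, ...
ds = Dataset.Tabular.from_delimited_files(
    path=(datastore, 'data/*/*/*.csv'),
    partition_format='data/{country}/{state}/*.csv')

print(ds.partition_keys)                         # ['country', 'state']
print(ds.get_partition_key_values(['country']))  # e.g. [{'country': 'US'}]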
Example #3
def submit(script_run_config,
           workspace,
           experiment_name,
           run_id=None,
           _parent_run_id=None,
           credential_passthrough=False):
    """Submit and return a script run.

    This function creates an :class:`azureml.core.Experiment`, applies the run configuration,
    submits the run, and returns a :class:`azureml.core.script_run.ScriptRun` object.

    :param script_run_config: The configuration information for the run.
    :type script_run_config:  azureml.core.script_run_config.ScriptRunConfig
    :param workspace: A workspace in which to create the experiment.
    :type workspace: azureml.core.workspace.Workspace
    :param experiment_name: The name of the experiment.
    :type experiment_name: str
    :param run_id: An optional ID of the run.
    :type run_id: str
    :param _parent_run_id: Internal use only.
    :type _parent_run_id: str
    :param credential_passthrough: Allow remote compute to run with user's credentials.
    :type credential_passthrough: bool
    :return: A script run object.
    :rtype: azureml.core.script_run.ScriptRun
    """
    from azureml.core import Experiment
    from azureml._execution import _commands
    from azureml._project.project import Project

    experiment = Experiment(workspace, experiment_name, _create_in_cloud=False)
    project = Project(directory=script_run_config.source_directory,
                      experiment=experiment)

    run_config = get_run_config_from_script_run(script_run_config)
    run_config.credential_passthrough = credential_passthrough

    inputs, _ = _update_args_and_io(workspace, run_config)
    collect_datasets_usage(module_logger, _SCRIPT_RUN_SUBMIT_ACTIVITY, inputs,
                           workspace, run_config.target)
    run = _commands.start_run(
        project,
        run_config,
        telemetry_values=script_run_config._telemetry_values,
        run_id=run_id,
        parent_run_id=_parent_run_id)
    run.add_properties(
        global_tracking_info_registry.gather_all(
            script_run_config.source_directory))

    return run
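A hedged sketch of calling this helper directly; in practice the same submission is typically done through Experiment.submit, and the source directory, entry script, compute target, and experiment name below are illustrative:

from azureml.core import Workspace, ScriptRunConfig

ws = Workspace.from_config()
config = ScriptRunConfig(source_directory='./src',       # hypothetical project folder
                         script='train.py',              # hypothetical entry script
                         compute_target='cpu-cluster')   # hypothetical compute target

run = submit(config, workspace=ws, experiment_name='train-experiment')
run.wait_for_completion(show_output=True)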
Example #4
    def time_recent(self, time_delta, include_boundary=True, validate=True):
        """Filter TabularDataset to contain only the specified duration (amount) of recent data.

        :param time_delta: The duration (amount) of recent data to retrieve.
        :type time_delta: datetime.timedelta
        :param include_boundary: Indicate if the row associated with the boundary time (``time_delta``)
            should be included.
        :type include_boundary: bool
        :param validate: Indicates whether to validate if specified columns exist in the dataset. The default is True.
            Validation requires that the data source is accessible from the current compute.
        :type validate: bool
        :return: A TabularDataset with the new filtered dataset.
        :rtype: azureml.data.TabularDataset
        """
        if self._registration and self._registration.workspace:
            collect_datasets_usage(_get_logger(), _TIMESERIES_RECENT_ACTIVITY,
                                   [self], self._registration.workspace, "N/A")

        start_time = datetime.now() - time_delta
        return self._time_filter(self.time_recent.__name__,
                                 lower_bound=start_time,
                                 include_boundary=include_boundary,
                                 validate=validate)
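A minimal usage sketch for time_recent, again assuming a hypothetical registered time-series dataset; note that the lower bound is computed from datetime.now() on the client:

from datetime import timedelta
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name='weather')  # hypothetical registered time-series dataset

# Rows whose timestamp falls within the last seven days (boundary included by default).
recent = dataset.time_recent(timedelta(days=7))
df = recent.to_pandas_dataframe()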
Example #5
    def with_timestamp_columns(self,
                               timestamp=None,
                               partition_timestamp=None,
                               validate=False,
                               **kwargs):
        """Define timestamp columns for the dataset.

        .. remarks::

            The method defines columns to be used as timestamps. Timestamp columns on a dataset make it possible
            to treat the data as time-series data and enable additional capabilities. When a dataset has
            both ``timestamp`` (formerly referred to as fine_grain_timestamp) and ``partition_timestamp`` (formerly
            referred to as coarse_grain_timestamp) specified, the two columns should represent the same timeline.

        :param timestamp: The name of the column to use as timestamp (formerly referred to as
            fine_grain_timestamp) (optional). The default is None (clear).
        :type timestamp: str
        :param partition_timestamp: The name of the column to use as partition_timestamp (formerly referred to as
            coarse grain timestamp) (optional). The default is None (clear).
        :type partition_timestamp: str
        :param validate: Indicates whether to validate if specified columns exist in the dataset. The default is False.
            Validation requires that the data source is accessible from the current compute.
        :type validate: bool
        :return: Returns a new TabularDataset with timestamp columns defined.
        :rtype: azureml.data.TabularDataset
        """
        fine_grain_timestamp = kwargs.get(_DEPRECATED_TIMESTAMP_NAME, None)
        coarse_grain_timestamp = kwargs.get(
            _DEPRECATED_PARTITION_TIMESTAMP_NAME, None)
        if fine_grain_timestamp:
            warnings.warn("fine_grain_timestamp is deprecated, use timestamp.",
                          DeprecationWarning)
        if coarse_grain_timestamp:
            warnings.warn(
                "coarse_grain_timestamp is deprecated, use partition_timestamp.",
                DeprecationWarning)
        if (timestamp or partition_timestamp) and (fine_grain_timestamp
                                                   or coarse_grain_timestamp):
            raise UserErrorException(
                'fine_grain_timestamp and coarse_grain_timestamp have been replaced by '
                'timestamp and partition_timestamp parameters and cannot be used together.'
            )
        if not timestamp and partition_timestamp:
            raise UserErrorException(
                'partition_timestamp can be assigned only if timestamp is assigned.'
            )
        if timestamp and timestamp == partition_timestamp:
            raise UserErrorException(
                'partition_timestamp cannot be the same as timestamp.')
        if not fine_grain_timestamp and coarse_grain_timestamp:
            raise UserErrorException(
                'coarse_grain_timestamp can be assigned only if fine_grain_timestamp is '
                'assigned.')
        if fine_grain_timestamp and fine_grain_timestamp == coarse_grain_timestamp:
            raise UserErrorException(
                'coarse_grain_timestamp cannot be the same as fine_grain_timestamp.'
            )
        if timestamp:
            fine_grain_timestamp = timestamp
            coarse_grain_timestamp = partition_timestamp
        if validate:
            # Validate after the new parameter names have been mapped so that columns
            # passed via timestamp/partition_timestamp are also checked.
            self._validate_timestamp_columns(
                [fine_grain_timestamp, coarse_grain_timestamp])

        if self._registration and self._registration.workspace:
            collect_datasets_usage(_get_logger(),
                                   _TIMESERIES_WITH_TIMESTAMP_COLUMN_ACTIVITY,
                                   [self], self._registration.workspace, "N/A")

        dataset = TabularDataset._create(self._dataflow,
                                         self._properties,
                                         telemetry_info=self._telemetry_info)

        if fine_grain_timestamp:
            dataset._properties[
                _DATASET_PROP_TIMESTAMP_FINE] = fine_grain_timestamp
        else:
            if _DATASET_PROP_TIMESTAMP_FINE in self._properties:
                del dataset._properties[_DATASET_PROP_TIMESTAMP_FINE]

        if coarse_grain_timestamp:
            dataset._properties[
                _DATASET_PROP_TIMESTAMP_COARSE] = coarse_grain_timestamp
        else:
            if _DATASET_PROP_TIMESTAMP_COARSE in self._properties:
                del dataset._properties[_DATASET_PROP_TIMESTAMP_COARSE]
        return dataset
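A minimal sketch for with_timestamp_columns, assuming a hypothetical CSV path on the default blob datastore and an observation_time column; per the docstring, passing None clears a previously defined timestamp column:

from azureml.core import Workspace, Datastore, Dataset

ws = Workspace.from_config()
datastore = Datastore.get(ws, 'workspaceblobstore')

ds = Dataset.Tabular.from_delimited_files(path=(datastore, 'weather/*.csv'))  # hypothetical path

# Declare the timestamp column; validate=True checks that the column exists,
# which requires the data source to be reachable from the current compute.
ts_ds = ds.with_timestamp_columns(timestamp='observation_time', validate=True)

# Passing None clears the timestamp columns again.
cleared = ts_ds.with_timestamp_columns(timestamp=None)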
Example #6
    def partition_by(self,
                     partition_keys,
                     target,
                     name=None,
                     show_progress=True,
                     partition_as_file_dataset=False):
        """Partitioned data will be copied and output to the destination specified by target.

        Create the dataset from the output data path with the partition format, register the dataset if a name is
        provided, and return the dataset for the new data path with partitions.

        .. code-block:: python

            ds = Dataset.get_by_name('test') # indexed by country, state, partition_date

            # #1: call partition_by locally
            new_ds = ds.partition_by(name="repartitioned_ds", partition_keys=['country'],
                        target=DataPath(datastore, "repartition"))
            partition_keys = new_ds.partition_keys # ['country']

            # new_ds can be passed to PRS as input dataset

        :param partition_keys: Required, the partition keys used to partition the output data.
        :type partition_keys: builtin.list[str]
        :param target: Required, the datastore path where the dataframe parquet data will be uploaded to.
            A guid folder will be generated under the target path to avoid conflict.
        :type target: azureml.data.datapath.DataPath, azureml.core.datastore.Datastore
            or tuple(azureml.core.datastore.Datastore, str) object
        :param name: Optional, The registration name.
        :type name: str
        :param show_progress: Optional, indicates whether to show progress of the upload in the console.
            Defaults to True.
        :type show_progress: bool
        :param partition_as_file_dataset: Optional, indicates whether to return a FileDataset or not.
            Defaults to False.
        :type partition_as_file_dataset: bool
        :return: The saved or registered dataset.
        :rtype: azureml.data.TabularDataset
        """
        from uuid import uuid4
        from azureml.exceptions import UserErrorException
        from azureml.core import Dataset
        from azureml.data.data_reference import DataReference
        from azureml.data._dataset_factory_helper import get_progress_logger, parse_target
        from azureml.dataprep import FieldType
        from azureml.data.dataset_factory import TabularDatasetFactory

        import time
        starting_time = time.process_time()

        console = get_progress_logger(show_progress)
        console("Validating arguments.")
        if len(partition_keys) == 0:
            raise UserErrorException("partition_keys cannot be empty")

        column_types = self._dataflow.dtypes
        invalid_keys = []
        for key in partition_keys:
            if key not in column_types:
                invalid_keys.append(key)
        if len(invalid_keys) != 0:
            raise UserErrorException(
                "{0} are invalid partition keys".format(invalid_keys))

        if len(partition_keys) != len(set(partition_keys)):
            raise UserErrorException("partition_keys cannot have duplicates")
        console("Arguments validated.")

        guid = uuid4()
        datastore, relative_path = parse_target(target)
        relative_path_with_guid = "/%s/%s/" % (relative_path, guid)

        partition_format = relative_path_with_guid
        partition_path = relative_path_with_guid
        saved_dataset_key_column_types = {}

        for key in partition_keys:
            if column_types[key] == FieldType.DATE:
                partition_format = partition_format + '{' + key + ':yyyyMMddHHmmss}*/'
                del column_types[key]
            else:
                partition_format = partition_format + '{' + key + '}/'
            partition_path = partition_path + '*/'
            if key in column_types:
                saved_dataset_key_column_types[key] = column_types[key]

        partition_format = partition_format + '*.parquet'
        partition_path = partition_path + '*.parquet'

        console("Uploading file to {}".format(relative_path_with_guid))

        self._dataflow.write_to_parquet(
            partition_keys=partition_keys,
            directory_path=DataReference(datastore=datastore).path(
                relative_path_with_guid)).run_local()
        console("Successfully uploaded file to datastore.")

        console("Creating a new dataset.")
        if partition_as_file_dataset:
            saved_dataset = Dataset.File.\
                from_files(path=(datastore, partition_path), partition_format=partition_format)
        else:
            saved_dataset = TabularDatasetFactory.\
                from_parquet_files(path=(datastore, partition_path), partition_format=partition_format)
        saved_dataset = TabularDataset._create(
            saved_dataset._dataflow.set_column_types(
                saved_dataset_key_column_types),
            self._properties,
            telemetry_info=self._telemetry_info)

        console("Successfully created a new dataset.")

        if self._registration and self._registration.workspace:
            collect_datasets_usage(
                _get_logger(), _PATITION_BY_ACTIVITY, [self],
                self._registration.workspace, "N/A", {
                    "execution_time": time.process_time() - starting_time,
                    "number_of_partition_keys": len(partition_keys)
                })

        if name is None:
            return saved_dataset
        console("registering a new dataset.")
        registered_dataset = saved_dataset.register(datastore.workspace,
                                                    name,
                                                    create_new_version=True)
        console("Successfully created and registered a new dataset.")
        return registered_dataset
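A minimal sketch for partition_by, expanding the docstring example; the dataset name, datastore, and target path are illustrative:

from azureml.core import Workspace, Datastore, Dataset
from azureml.data.datapath import DataPath

ws = Workspace.from_config()
datastore = Datastore.get(ws, 'workspaceblobstore')

ds = Dataset.get_by_name(ws, name='test')  # hypothetical dataset with country/state columns

# Repartition by country and register the result; a GUID folder is created under 'repartition'.
new_ds = ds.partition_by(partition_keys=['country'],
                         target=DataPath(datastore, 'repartition'),
                         name='repartitioned_ds')
print(new_ds.partition_keys)  # ['country']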