Ejemplo n.º 1
0
def test_get_column_logical_type(sample_series):
    """Check logical type resolution with and without an explicit type.

    Args:
        sample_series: Series fixture handed to ``_get_column_logical_type``.
    """
    # No logical type supplied: the result is expected to be a Categorical instance.
    inferred = _get_column_logical_type(sample_series, None, "col_name")
    assert isinstance(inferred, Categorical)

    # Datetime supplied explicitly: the result is expected to be a Datetime instance.
    explicit = _get_column_logical_type(sample_series, Datetime, "col_name")
    assert isinstance(explicit, Datetime)
Ejemplo n.º 2
0
    def init(self,
             logical_type=None,
             semantic_tags=None,
             use_standard_tags=True,
             description=None,
             metadata=None):
        """Attach Woodwork typing information to the wrapped Series.

        The logical type is resolved from the series (using the supplied value,
        if any), validated against the series, and stored together with the
        remaining options in a new ``ColumnSchema`` on ``self._schema``.

        Args:
            logical_type (LogicalType or str, optional): Logical type to assign to
                the series. When omitted, the LogicalType is inferred. An error is
                raised if the provided or inferred LogicalType has a dtype that is
                not compatible with the series dtype.
            semantic_tags (str or list or set, optional): Semantic tags for the
                series; defaults to an empty set when not specified. Pass a single
                string for one tag, or a list or set of strings for several.
            use_standard_tags (bool, optional): If True, standard semantic tags are
                added based on the inferred or specified logical type. Defaults to
                True.
            description (str, optional): Free text describing the series contents.
            metadata (dict[str -> json serializable], optional): Metadata associated
                with the series.
        """
        resolved_type = _get_column_logical_type(
            self._series, logical_type, self._series.name)
        self._validate_logical_type(resolved_type)

        schema_options = {
            'logical_type': resolved_type,
            'semantic_tags': semantic_tags,
            'use_standard_tags': use_standard_tags,
            'description': description,
            'metadata': metadata,
        }
        self._schema = ColumnSchema(**schema_options)
Ejemplo n.º 3
0
def init_series(
    series,
    logical_type=None,
    semantic_tags=None,
    use_standard_tags=True,
    description=None,
    origin=None,
    metadata=None,
):
    """Build a new, Woodwork-initialized Series from a series or a 1-D array.

    Accepts a series, a numpy.ndarray or a pd.api.extensions.ExtensionArray.
    One-dimensional array inputs are wrapped in a ``pd.Series`` first. The
    returned series has its dtype converted to match the dtype associated with
    the LogicalType.

    Args:
        series (pd.Series, dd.Series, ps.Series, numpy.ndarray or pd.api.extensions.ExtensionArray):
            The original data from which to create the Woodwork initialized series.
        logical_type (LogicalType or str, optional): Logical type to assign to the
            series. When omitted, the LogicalType is inferred.
        semantic_tags (str or list or set, optional): Semantic tags for the series;
            defaults to an empty set when not specified. Pass a single string for
            one tag, or a list or set of strings for several.
        use_standard_tags (bool, optional): If True, standard semantic tags are
            added based on the inferred or specified logical type. Defaults to True.
        description (str, optional): Free text describing the series contents.
        origin (str, optional): Free text naming the origin of the column
            (i.e. "base" or "engineered").
        metadata (dict[str -> json serializable], optional): Metadata associated
            with the series.

    Returns:
        Series: A series with Woodwork typing information initialized

    Raises:
        ValueError: If a numpy.ndarray that is not one-dimensional is passed.
        TypeError: If the input is neither a series nor a one-dimensional
            array of a supported kind.
    """
    if not _is_series(series):
        # Reject multi-dimensional numpy arrays with a dedicated message.
        if isinstance(series, np.ndarray) and series.ndim != 1:
            raise ValueError(
                f"np.ndarray input must be 1 dimensional. Current np.ndarray is {series.ndim} dimensional"
            )
        array_types = (np.ndarray, pd.api.extensions.ExtensionArray)
        if not (isinstance(series, array_types) and series.ndim == 1):
            raise TypeError(
                f"Input must be of series type. The current input is of type {type(series)}"
            )
        series = pd.Series(series)

    resolved_type = _get_column_logical_type(series, logical_type, series.name)
    initialized = resolved_type.transform(series)
    init_options = {
        "logical_type": resolved_type,
        "semantic_tags": semantic_tags,
        "use_standard_tags": use_standard_tags,
        "description": description,
        "origin": origin,
        "metadata": metadata,
    }
    initialized.ww.init(**init_options)
    return initialized
Ejemplo n.º 4
0
def init_series(series,
                logical_type=None,
                semantic_tags=None,
                use_standard_tags=True,
                description=None,
                metadata=None):
    """Build a new Series carrying Woodwork typing information.

    The returned series has its dtype converted to match the dtype associated
    with the LogicalType.

    Args:
        series (pd.Series, dd.Series, or ks.Series): The original series from which
            to create the Woodwork initialized series.
        logical_type (LogicalType or str, optional): Logical type to assign to the
            series. When omitted, the LogicalType is inferred.
        semantic_tags (str or list or set, optional): Semantic tags for the series;
            defaults to an empty set when not specified. Pass a single string for
            one tag, or a list or set of strings for several.
        use_standard_tags (bool, optional): If True, standard semantic tags are
            added based on the inferred or specified logical type. Defaults to True.
        description (str, optional): Free text describing the series contents.
        metadata (dict[str -> json serializable], optional): Metadata associated
            with the series.

    Returns:
        Series: A series with Woodwork typing information initialized
    """
    resolved_type = _get_column_logical_type(series, logical_type, series.name)
    converted = _update_column_dtype(series, resolved_type)

    init_options = {
        'logical_type': resolved_type,
        'semantic_tags': semantic_tags,
        'use_standard_tags': use_standard_tags,
        'description': description,
        'metadata': metadata,
    }
    converted.ww.init(**init_options)
    return converted
Ejemplo n.º 5
0
    def init(self,
             index=None,
             time_index=None,
             logical_types=None,
             make_index=False,
             already_sorted=False,
             schema=None,
             validate=True,
             use_standard_tags=True,
             **kwargs):
        """Attach Woodwork typing information to the wrapped DataFrame.

        When ``schema`` is given it is stored as-is and every other typing
        argument is ignored (a ``ParametersIgnoredWarning`` lists the ignored
        ones). Otherwise a logical type is resolved for each column, column
        data is replaced where the dtype update returns a new series, and a
        ``TableSchema`` is built from the results.

        Args:
            index (str, optional): Name of the index column.
            time_index (str, optional): Name of the time index column.
            logical_types (dict[str -> LogicalType]): Mapping from column names in
                the DataFrame to the LogicalType for the column.
            make_index (bool, optional): If True, a new unique, numeric index column
                named by ``index`` is created and added to the supplied DataFrame;
                in that case ``index`` cannot match an existing column name in
                ``dataframe``. If False, the name specified in ``index`` must match
                a column present in the ``dataframe``. Defaults to False.
            already_sorted (bool, optional): Whether the input DataFrame is already
                sorted on the time index. If False, the dataframe is sorted first on
                the time_index and then on the index (pandas DataFrame only).
                Defaults to False.
            schema (Woodwork.TableSchema, optional): Typing information to use for
                the DataFrame instead of performing inference. Any other arguments
                provided will be ignored. Changes made to the schema object after
                initialization propagate to the DataFrame, so the same schema object
                should not be shared between DataFrames.
            validate (bool, optional): Whether parameter and data validation should
                occur. Defaults to True. Warning: should be set to False only when
                parameters and data are known to be valid; errors resulting from
                skipping validation with invalid inputs may not be easily understood.
            use_standard_tags (bool, dict[str -> bool], optional): Controls whether
                standard semantic tags are added to columns based on their logical
                type. A single boolean applies to all columns; a dictionary sets the
                value per column, with unspecified columns using the default.
                Defaults to True.
            **kwargs: Forwarded unchanged to ``TableSchema``. Documented options:
                name (str, optional): Name used to identify the DataFrame.
                semantic_tags (dict, optional): Mapping from column names to
                    semantic tags. Each value is either a single string or a
                    list/set of strings. Columns not included get an empty set.
                table_metadata (dict[str -> json serializable], optional): Extra
                    metadata for Woodwork.
                column_metadata (dict[str -> dict[str -> json serializable]], optional):
                    Mapping from column names to that column's metadata dictionary.
                column_descriptions (dict[str -> str], optional): Mapping from
                    column names to column descriptions.
        """
        if validate:
            _validate_accessor_params(self._dataframe, index, make_index,
                                      time_index, logical_types, schema,
                                      use_standard_tags)

        if schema is not None:
            self._schema = schema

            # Pair each parameter name with a flag saying whether the caller set it
            # to a non-default value; the order here is the order used in the warning.
            supplied_flags = (
                ('index', index is not None),
                ('make_index', make_index),
                ('time_index', time_index is not None),
                ('logical_types', logical_types is not None),
                ('already_sorted', already_sorted),
                ('use_standard_tags',
                 not use_standard_tags or isinstance(use_standard_tags, dict)),
            )
            ignored = [label for label, was_set in supplied_flags if was_set]
            ignored.extend(kwargs)
            if ignored:
                warnings.warn(
                    "A schema was provided and the following parameters were ignored: "
                    + ", ".join(ignored), ParametersIgnoredWarning)

            # make_index still has to exist on the accessor when a schema is used,
            # but any value passed by the caller is deliberately discarded.
            self.make_index = False
            return

        self.make_index = make_index
        if make_index:
            _make_index(self._dataframe, index)

        # Resolve a logical type per column and write back any converted data.
        resolved_types = {}
        for col in self._dataframe.columns:
            column_data = self._dataframe[col]
            requested = logical_types.get(col) if logical_types else None

            col_type = _get_column_logical_type(column_data, requested, col)
            resolved_types[col] = col_type

            converted = _update_column_dtype(column_data, col_type)
            if converted is not column_data:
                self._dataframe[col] = converted

        all_columns = list(self._dataframe.columns)

        # TableSchema has a different default for use_standard_tags, so expand the
        # value into an explicit per-column mapping before handing it over.
        if isinstance(use_standard_tags, bool):
            use_standard_tags = dict.fromkeys(all_columns, use_standard_tags)
        else:
            use_standard_tags = {
                **dict.fromkeys(all_columns, True),
                **use_standard_tags
            }

        self._schema = TableSchema(column_names=all_columns,
                                   logical_types=resolved_types,
                                   index=index,
                                   time_index=time_index,
                                   validate=validate,
                                   use_standard_tags=use_standard_tags,
                                   **kwargs)

        self._set_underlying_index()
        if self._schema.time_index is not None:
            self._sort_columns(already_sorted)
Ejemplo n.º 6
0
def test_get_column_logical_type(sample_series):
    """Check logical type resolution with and without an explicit type.

    Args:
        sample_series: Series fixture handed to ``_get_column_logical_type``.
    """
    # No logical type supplied: the result is expected to equal Categorical.
    inferred = _get_column_logical_type(sample_series, None, 'col_name')
    assert inferred == Categorical

    # Datetime supplied explicitly: the result is expected to equal Datetime.
    explicit = _get_column_logical_type(sample_series, Datetime, 'col_name')
    assert explicit == Datetime
Ejemplo n.º 7
0
    def init(self,
             logical_type=None,
             semantic_tags=None,
             use_standard_tags=True,
             description=None,
             metadata=None,
             schema=None,
             validate=True):
        """Attach Woodwork typing information to the wrapped Series.

        Without a ``schema`` the logical type is resolved from the series and a
        new ``ColumnSchema`` is stored on ``self._schema``. With a ``schema`` the
        object is stored as-is and any other typing arguments are ignored (a
        ``ParametersIgnoredWarning`` lists the ignored ones).

        Args:
            logical_type (LogicalType or str, optional): Logical type to assign to
                the series. When omitted, the LogicalType is inferred. An error is
                raised if the provided or inferred LogicalType has a dtype that is
                not compatible with the series dtype.
            semantic_tags (str or list or set, optional): Semantic tags for the
                series; defaults to an empty set when not specified. Pass a single
                string for one tag, or a list or set of strings for several.
            use_standard_tags (bool, optional): If True, standard semantic tags are
                added based on the inferred or specified logical type. Defaults to
                True.
            description (str, optional): Free text describing the series contents.
            metadata (dict[str -> json serializable], optional): Metadata associated
                with the series.
            schema (Woodwork.ColumnSchema, optional): Typing information to use for
                the Series instead of performing inference. Any other arguments
                provided will be ignored. Changes made to the schema object after
                initialization propagate to the Series, so the same schema object
                should not be shared between Series.
            validate (bool, optional): Whether parameter and data validation should
                occur. Defaults to True. Warning: should be set to False only when
                parameters and data are known to be valid; errors resulting from
                skipping validation with invalid inputs may not be easily understood.
        """
        if schema is None:
            # Inference path: resolve the type, optionally validate it, build a schema.
            resolved_type = _get_column_logical_type(
                self._series, logical_type, self._series.name)
            if validate:
                self._validate_logical_type(resolved_type)

            self._schema = ColumnSchema(logical_type=resolved_type,
                                        semantic_tags=semantic_tags,
                                        use_standard_tags=use_standard_tags,
                                        description=description,
                                        metadata=metadata,
                                        validate=validate)
            return

        if validate:
            _validate_schema(schema, self._series)

        # Pair each parameter name with a flag saying whether the caller set it
        # to a non-default value; the order here is the order used in the warning.
        supplied_flags = (
            ('logical_type', logical_type is not None),
            ('semantic_tags', semantic_tags is not None),
            ('description', description is not None),
            ('metadata', metadata is not None),
            ('use_standard_tags', not use_standard_tags),
        )
        ignored = [label for label, was_set in supplied_flags if was_set]
        if ignored:
            warnings.warn(
                "A schema was provided and the following parameters were ignored: "
                + ", ".join(ignored), ParametersIgnoredWarning)

        self._schema = schema
Ejemplo n.º 8
0
    def init(
        self,
        logical_type=None,
        semantic_tags=None,
        use_standard_tags=True,
        description=None,
        origin=None,
        metadata=None,
        schema=None,
        validate=True,
    ):
        """Attach Woodwork typing information to the wrapped Series.

        With a ``schema`` the object is stored as-is and any other typing
        arguments are ignored (a ``ParametersIgnoredWarning`` lists the ignored
        ones). Without one, the logical type is resolved from the series and a
        new ``ColumnSchema`` is stored on ``self._schema``. During validation,
        ``Ordinal`` and ``LatLong`` instances are checked through their own
        ``validate`` method; every other type has its valid dtype compared with
        the series dtype.

        Args:
            logical_type (LogicalType or str, optional): Logical type to assign to
                the series. When omitted, the LogicalType is inferred. An error is
                raised if the provided or inferred LogicalType has a dtype that is
                not compatible with the series dtype.
            semantic_tags (str or list or set, optional): Semantic tags for the
                series; defaults to an empty set when not specified. Pass a single
                string for one tag, or a list or set of strings for several.
            use_standard_tags (bool, optional): If True, standard semantic tags are
                added based on the inferred or specified logical type. Defaults to
                True.
            description (str, optional): Free text describing the series contents.
            origin (str, optional): Free text naming the origin of the column
                (i.e. "base" or "engineered").
            metadata (dict[str -> json serializable], optional): Metadata associated
                with the series.
            schema (Woodwork.ColumnSchema, optional): Typing information to use for
                the Series instead of performing inference. Any other arguments
                provided will be ignored. Changes made to the schema object after
                initialization propagate to the Series, so the same schema object
                should not be shared between Series.
            validate (bool, optional): Whether parameter and data validation should
                occur. Defaults to True. Warning: should be set to False only when
                parameters and data are known to be valid; errors resulting from
                skipping validation with invalid inputs may not be easily understood.

        Raises:
            TypeValidationError: If validation is enabled and the series dtype does
                not match the valid dtype of the resolved logical type.
        """
        if schema is not None:
            if validate:
                _validate_schema(schema, self._series)

            # Pair each parameter name with a flag saying whether the caller set it
            # to a non-default value; the order here is the order used in the warning.
            supplied_flags = (
                ("logical_type", logical_type is not None),
                ("semantic_tags", semantic_tags is not None),
                ("description", description is not None),
                ("origin", origin is not None),
                ("metadata", metadata is not None),
                ("use_standard_tags", not use_standard_tags),
            )
            ignored = [label for label, was_set in supplied_flags if was_set]
            if ignored:
                warnings.warn(
                    "A schema was provided and the following parameters were ignored: "
                    + ", ".join(ignored),
                    ParametersIgnoredWarning,
                )

            self._schema = schema
            return

        logical_type = _get_column_logical_type(
            self._series, logical_type, self._series.name)

        if validate:
            if not isinstance(logical_type, (Ordinal, LatLong)):
                # Generic types: the series dtype must equal the type's valid dtype.
                valid_dtype = logical_type._get_valid_dtype(type(self._series))
                dtype_matches = _check_data_type_equality(
                    valid_dtype, str(self._series.dtype))
                if not dtype_matches:
                    raise TypeValidationError(
                        f"Cannot initialize Woodwork. Series dtype '{self._series.dtype}' is "
                        f"incompatible with {logical_type} dtype. Try converting series "
                        f"dtype to '{valid_dtype}' before initializing or use the "
                        "woodwork.init_series function to initialize.")
            else:
                # Ordinal and LatLong carry their own validation routine.
                logical_type.validate(self._series)

        schema_options = {
            "logical_type": logical_type,
            "semantic_tags": semantic_tags,
            "use_standard_tags": use_standard_tags,
            "description": description,
            "origin": origin,
            "metadata": metadata,
            "validate": validate,
        }
        self._schema = ColumnSchema(**schema_options)