class WoodworkColumnAccessor:
    def __init__(self, series):
        self._series = series
        self._schema = None

    def init(self,
             logical_type=None,
             semantic_tags=None,
             use_standard_tags=True,
             description=None,
             metadata=None,
             schema=None,
             validate=True):
        """Initializes Woodwork typing information for a Series.

        Args:
            logical_type (LogicalType or str, optional): The logical type that should be assigned
                to the series. If no value is provided, the LogicalType for the series will
                be inferred. If the LogicalType provided or inferred does not have a dtype that
                is compatible with the series dtype, an error will be raised.
            semantic_tags (str or list or set, optional): Semantic tags to assign to the series.
                Defaults to an empty set if not specified. There are two options for
                specifying the semantic tags:
                (str) If only one semantic tag is being set, a single string can be passed.
                (list or set) If multiple tags are being set, a list or set of strings can be passed.
            use_standard_tags (bool, optional): If True, will add standard semantic tags to the series
                based on the inferred or specified logical type of the series. Defaults to True.
            description (str, optional): Optional text describing the contents of the series.
            metadata (dict[str -> json serializable], optional): Metadata associated with the series.
            schema (Woodwork.ColumnSchema, optional): Typing information to use for the Series instead of performing inference.
                Any other arguments provided will be ignored. Note that any changes made to the schema object after
                initialization will propagate to the Series. For this reason, to avoid unintended typing information
                changes, the same schema object should not be shared between Series.
            validate (bool, optional): Whether parameter and data validation should occur. Defaults to True. Warning:
                Should be set to False only when parameters and data are known to be valid.
                Any errors resulting from skipping validation with invalid inputs may not be easily understood.
        """

        if schema is not None:
            if validate:
                _validate_schema(schema, self._series)

            extra_params = []
            if logical_type is not None:
                extra_params.append('logical_type')
            if semantic_tags is not None:
                extra_params.append('semantic_tags')
            if description is not None:
                extra_params.append('description')
            if metadata is not None:
                extra_params.append('metadata')
            if not use_standard_tags:
                extra_params.append('use_standard_tags')
            if extra_params:
                warnings.warn(
                    "A schema was provided and the following parameters were ignored: "
                    + ", ".join(extra_params), ParametersIgnoredWarning)

            self._schema = schema
        else:
            logical_type = _get_column_logical_type(self._series, logical_type,
                                                    self._series.name)

            if validate:
                self._validate_logical_type(logical_type)

            self._schema = ColumnSchema(logical_type=logical_type,
                                        semantic_tags=semantic_tags,
                                        use_standard_tags=use_standard_tags,
                                        description=description,
                                        metadata=metadata,
                                        validate=validate)

    @property
    def schema(self):
        return copy.deepcopy(self._schema)

    @property
    def description(self):
        """The description of the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.description

    @description.setter
    def description(self, description):
        if self._schema is None:
            _raise_init_error()
        _validate_description(description)
        self._schema.description = description

    @property
    def iloc(self):
        """
        Integer-location based indexing for selection by position.
        ``.iloc[]`` is primarily integer position based (from ``0`` to
        ``length-1`` of the axis), but may also be used with a boolean array.

        If the selection result is a Series, Woodwork typing information will
        be initialized for the returned Series.

        Allowed inputs are:
            An integer, e.g. ``5``.
            A list or array of integers, e.g. ``[4, 3, 0]``.
            A slice object with ints, e.g. ``1:7``.
            A boolean array.
            A ``callable`` function with one argument (the calling Series or
            DataFrame) and that returns valid output for indexing (one of the above).
            This is useful in method chains, when you don't have a reference to the
            calling object, but would like to base your selection on some value.
        """
        if self._schema is None:
            _raise_init_error()
        return _iLocIndexer(self._series)

    @property
    def loc(self):
        """
        Access a group of rows by label(s) or a boolean array.

        ``.loc[]`` is primarily label based, but may also be used with a
        boolean array.

        If the selection result is a Series, Woodwork typing information will
        be initialized for the returned Series.

        Allowed inputs are:
            A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
            interpreted as a *label* of the index, and **never** as an
            integer position along the index).
            A list or array of labels, e.g. ``['a', 'b', 'c']``.
            A slice object with labels, e.g. ``'a':'f'``.
            A boolean array of the same length as the axis being sliced,
            e.g. ``[True, False, True]``.
            An alignable boolean Series. The index of the key will be aligned before
            masking.
            An alignable Index. The Index of the returned selection will be the input.
            A ``callable`` function with one argument (the calling Series or
            DataFrame) and that returns valid output for indexing (one of the above)
        """
        if self._schema is None:
            _raise_init_error()
        return _locIndexer(self._series)

    @property
    def logical_type(self):
        """The logical type of the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.logical_type

    @property
    def metadata(self):
        """The metadata of the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.metadata

    @metadata.setter
    def metadata(self, metadata):
        if self._schema is None:
            _raise_init_error()
        _validate_metadata(metadata)
        self._schema.metadata = metadata

    @property
    def semantic_tags(self):
        """The semantic tags assigned to the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.semantic_tags

    @property
    def use_standard_tags(self):
        if self._schema is None:
            _raise_init_error()
        return self._schema.use_standard_tags

    def __eq__(self, other):
        if self._schema != other._schema:
            return False
        if self._series.name != other._series.name:
            return False
        if isinstance(self._series, pd.Series):
            return self._series.equals(other._series)
        return True

    def __getattr__(self, attr):
        # __getattr__ is only called when an attribute is not found on the
        # Accessor itself, so forward the lookup to the underlying Series.
        if self._schema is None:
            _raise_init_error()
        if hasattr(self._series, attr):
            return self._make_series_call(attr)
        else:
            raise AttributeError(f"Woodwork has no attribute '{attr}'")

    def __repr__(self):
        if self._schema is None:
            _raise_init_error()
        msg = u"<Series: {} ".format(self._series.name)
        msg += u"(Physical Type = {}) ".format(self._series.dtype)
        msg += u"(Logical Type = {}) ".format(self.logical_type)
        msg += u"(Semantic Tags = {})>".format(self.semantic_tags)
        return msg

    def _make_series_call(self, attr):
        """Forwards the requested attribute onto the series object. Intercepts return value,
        attempting to initialize Woodwork with the current schema when a new Series is returned.
        Confirms schema is still valid for the original Series."""
        series_attr = getattr(self._series, attr)

        if callable(series_attr):

            def wrapper(*args, **kwargs):
                # Make Series call and intercept the result
                result = series_attr(*args, **kwargs)

                # Try to initialize Woodwork with the existing schema
                if _is_series(result):
                    valid_dtype = _get_valid_dtype(type(result),
                                                   self._schema.logical_type)
                    if str(result.dtype) == valid_dtype:
                        result.ww.init(schema=self.schema, validate=False)
                    else:
                        invalid_schema_message = (
                            f"dtype mismatch between original dtype, {valid_dtype}, "
                            f"and returned dtype, {result.dtype}")
                        warning_message = TypingInfoMismatchWarning().get_warning_message(
                            attr, invalid_schema_message, 'Series')
                        warnings.warn(warning_message, TypingInfoMismatchWarning)
                # Always return the results of the Series operation whether or not Woodwork is initialized
                return result

            return wrapper
        # Directly return non-callable Series attributes
        return series_attr

    def _validate_logical_type(self, logical_type):
        """Validates that a logical type is consistent with the series dtype. Performs additional type
        specific validation, as required."""
        valid_dtype = _get_valid_dtype(type(self._series), logical_type)
        if valid_dtype != str(self._series.dtype):
            raise ValueError(
                f"Cannot initialize Woodwork. Series dtype '{self._series.dtype}' is "
                f"incompatible with {logical_type} dtype. Try converting series "
                f"dtype to '{valid_dtype}' before initializing or use the "
                "woodwork.init_series function to initialize.")

        if isinstance(logical_type, Ordinal):
            logical_type._validate_data(self._series)
        elif logical_type == LatLong:
            if not _is_valid_latlong_series(self._series):
                raise ValueError(
                    "Cannot initialize Woodwork. Series does not contain properly formatted "
                    "LatLong data. Try reformatting before initializing or use the "
                    "woodwork.init_series function to initialize.")

    def add_semantic_tags(self, semantic_tags):
        """Add the specified semantic tags to the set of tags.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to add
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._add_semantic_tags(semantic_tags, self._series.name)

    def remove_semantic_tags(self, semantic_tags):
        """Removes specified semantic tags from the current tags.

        Args:
            semantic_tags (str/list/set): Semantic tag(s) to remove.
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._remove_semantic_tags(semantic_tags, self._series.name)

    def reset_semantic_tags(self):
        """Reset the semantic tags to the default values. The default values
        will be either an empty set or a set of the standard tags based on the
        column logical type, controlled by the use_standard_tags property.

        Args:
            None
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._reset_semantic_tags()

    def set_logical_type(self, logical_type):
        """Update the logical type for the series, clearing any previously set semantic tags,
        and returning a new series with Woodwork initialied.

        Args:
            logical_type (LogicalType, str): The new logical type to set for the series.

        Returns:
            Series: A new series with the updated logical type.
        """
        if self._schema is None:
            _raise_init_error()
        # Create a new series without a schema to prevent new series from sharing a common
        # schema with current series
        new_series = self._series.copy()
        new_series._schema = None
        return init_series(new_series,
                           logical_type=logical_type,
                           semantic_tags=None,
                           use_standard_tags=self._schema.use_standard_tags,
                           description=self.description,
                           metadata=copy.deepcopy(self.metadata))

    def set_semantic_tags(self, semantic_tags):
        """Replace current semantic tags with new values. If `use_standard_tags` is set
        to True for the series, any standard tags associated with the LogicalType of the
        series will be added as well.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to set
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._set_semantic_tags(semantic_tags)
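
A minimal usage sketch of the accessor above (hedged: assumes the standard `ww` accessor registration that importing woodwork provides; the column name and tags are illustrative):

import pandas as pd
import woodwork  # noqa: F401 -- importing registers the `ww` accessor on pandas objects

s = pd.Series(["a", "b", "a"], dtype="category", name="letters")
s.ww.init(logical_type="Categorical", semantic_tags="demo_tag")

s.ww.logical_type   # Categorical
s.ww.semantic_tags  # {'category', 'demo_tag'} since use_standard_tags defaults to True
s.ww.add_semantic_tags("extra_tag")
s.ww.remove_semantic_tags("extra_tag")
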
class MaxAboveTen(AggregationPrimitive):
    name = "max_above_ten"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
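
The snippet above declares only the primitive's type signature. A hedged sketch of how the computation could be completed (the `get_function` body below is an assumption for illustration, not part of the original):

import numpy as np
from woodwork.column_schema import ColumnSchema

from featuretools.primitives import AggregationPrimitive


class MaxAboveTen(AggregationPrimitive):
    name = "max_above_ten"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def get_function(self):
        def max_above_ten(values):
            # Illustrative aggregation: the largest value greater than ten,
            # or NaN when no value qualifies
            above = values[values > 10]
            return above.max() if len(above) else np.nan

        return max_above_ten
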
Example #3
class DirectFeature(FeatureBase):
    """Feature for child dataframe that inherits
    a feature value from a parent dataframe"""

    input_types = [ColumnSchema()]
    return_type = None

    def __init__(
        self, base_feature, child_dataframe_name, relationship=None, name=None
    ):
        base_feature = _validate_base_features(base_feature)[0]
        self.parent_dataframe_name = base_feature.dataframe_name
        relationship = self._handle_relationship(
            base_feature.entityset, child_dataframe_name, relationship
        )
        child_dataframe = base_feature.entityset[child_dataframe_name]
        super(DirectFeature, self).__init__(
            dataframe=child_dataframe,
            base_features=[base_feature],
            relationship_path=RelationshipPath([(True, relationship)]),
            primitive=PrimitiveBase,
            name=name,
        )

    def _handle_relationship(self, entityset, child_dataframe_name, relationship):
        child_dataframe = entityset[child_dataframe_name]
        if relationship:
            relationship_child = relationship.child_dataframe
            assert (
                child_dataframe.ww.name == relationship_child.ww.name
            ), "child_dataframe must be the relationship child dataframe"

            assert (
                self.parent_dataframe_name == relationship.parent_dataframe.ww.name
            ), "Base feature must be defined on the relationship parent dataframe"
        else:
            child_relationships = entityset.get_forward_relationships(
                child_dataframe.ww.name
            )
            possible_relationships = (
                r
                for r in child_relationships
                if r.parent_dataframe.ww.name == self.parent_dataframe_name
            )
            relationship = next(possible_relationships, None)

            if not relationship:
                raise RuntimeError(
                    'No relationship from "%s" to "%s" found.'
                    % (child_dataframe.ww.name, self.parent_dataframe_name)
                )

            # Check for another path.
            elif next(possible_relationships, None):
                message = (
                    "There are multiple relationships to the base dataframe. "
                    "You must specify a relationship."
                )
                raise RuntimeError(message)

        return relationship

    @classmethod
    def from_dictionary(cls, arguments, entityset, dependencies, primitive):
        base_feature = dependencies[arguments["base_feature"]]
        relationship = Relationship.from_dictionary(
            arguments["relationship"], entityset
        )
        child_dataframe_name = relationship.child_dataframe.ww.name
        return cls(
            base_feature=base_feature,
            child_dataframe_name=child_dataframe_name,
            relationship=relationship,
            name=arguments["name"],
        )

    @property
    def number_output_features(self):
        return self.base_features[0].number_output_features

    @property
    def default_value(self):
        return self.base_features[0].default_value

    def copy(self):
        """Return copy of feature"""
        _is_forward, relationship = self.relationship_path[0]
        return DirectFeature(
            self.base_features[0], self.dataframe_name, relationship=relationship
        )

    @property
    def column_schema(self):
        return self.base_features[0].column_schema

    def generate_name(self):
        return self._name_from_base(self.base_features[0].get_name())

    def generate_names(self):
        return [
            self._name_from_base(base_name)
            for base_name in self.base_features[0].get_feature_names()
        ]

    def get_arguments(self):
        _is_forward, relationship = self.relationship_path[0]
        return {
            "name": self.get_name(),
            "base_feature": self.base_features[0].unique_name(),
            "relationship": relationship.to_dictionary(),
        }

    def _name_from_base(self, base_name):
        return "%s.%s" % (self.relationship_path_name(), base_name)
Example #4
def test_column_schema_params():
    column = ColumnSchema(logical_type=Integer, description='this is a column!', metadata={'created_by': 'user1'})

    assert column.description == 'this is a column!'
    assert column.metadata == {'created_by': 'user1'}
Example #5
class CustomTrans(TransformPrimitive):
    name = "custom_transform"
    input_types = [ColumnSchema(semantic_tags={"category"})]
    return_type = ColumnSchema(semantic_tags={"category"})
def test_is_valid_input():
    assert is_valid_input(candidate=ColumnSchema(), template=ColumnSchema())

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
                          template=ColumnSchema(logical_type=Integer, semantic_tags={'index'}))

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index', 'numeric'}),
                          template=ColumnSchema(semantic_tags={'index'}))

    assert is_valid_input(candidate=ColumnSchema(semantic_tags={'index'}),
                          template=ColumnSchema(semantic_tags={'index'}))

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
                          template=ColumnSchema())

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer),
                          template=ColumnSchema(logical_type=Integer))

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'numeric'}),
                          template=ColumnSchema(logical_type=Integer))

    assert not is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
                              template=ColumnSchema(logical_type=Double, semantic_tags={'index'}))

    assert not is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={}),
                              template=ColumnSchema(logical_type=Integer, semantic_tags={'index'}))

    assert not is_valid_input(candidate=ColumnSchema(),
                              template=ColumnSchema(logical_type=Integer, semantic_tags={'index'}))

    assert not is_valid_input(candidate=ColumnSchema(),
                              template=ColumnSchema(logical_type=Integer))

    assert not is_valid_input(candidate=ColumnSchema(),
                              template=ColumnSchema(semantic_tags={'index'}))
Example #7
def test_schema_equality():
    col = ColumnSchema(logical_type=Categorical)
    diff_description_col = ColumnSchema(logical_type=Categorical, description='description')
    diff_metadata_col = ColumnSchema(logical_type=Categorical, metadata={'interesting_values': ['a', 'b']})
    use_standard_tags_col = ColumnSchema(logical_type=Categorical, use_standard_tags=True)
    diff_tags_col = ColumnSchema(logical_type=Categorical, semantic_tags={'new_tag'})

    assert col != diff_description_col
    assert col != diff_metadata_col
    assert col != use_standard_tags_col
    assert col != diff_tags_col

    # Check columns with same logical types but different parameters
    ordinal_ltype_1 = Ordinal(order=['a', 'b', 'c'])
    ordinal_ltype_2 = Ordinal(order=['b', 'a', 'c'])
    ordinal_col_1 = ColumnSchema(logical_type=ordinal_ltype_1)
    ordinal_col_2 = ColumnSchema(logical_type=ordinal_ltype_2)

    assert col != ordinal_col_1
    assert ordinal_col_1 != ordinal_col_2
    assert ordinal_col_1 == ordinal_col_1

    datetime_ltype_instantiated = Datetime(datetime_format='%Y-%m%d')

    datetime_col_format = ColumnSchema(logical_type=datetime_ltype_instantiated)
    datetime_col_param = ColumnSchema(logical_type=Datetime(datetime_format=None))
    datetime_col_instantiated = ColumnSchema(logical_type=Datetime())
    datetime_col = ColumnSchema(logical_type=Datetime)

    assert datetime_col != datetime_col_instantiated
    assert datetime_col_instantiated != datetime_col_format
    assert datetime_col_instantiated == datetime_col_param
def test_remove_standard_semantic_tag():
    # Check that warning is raised if use_standard_tags is True - tag should be removed
    schema = ColumnSchema(logical_type=Categorical,
                          semantic_tags='tag1',
                          use_standard_tags=True)
    expected_message = 'Standard tags have been removed from "col_name"'
    with pytest.warns(StandardTagsChangedWarning) as record:
        schema._remove_semantic_tags(['tag1', 'category'], 'col_name')
    assert len(record) == 1
    assert record[0].message.args[0] == expected_message
    assert schema.semantic_tags == set()

    # Check that warning is not raised if use_standard_tags is False - tag should be removed
    schema = ColumnSchema(logical_type=Categorical,
                          semantic_tags=['category', 'tag1'],
                          use_standard_tags=False)

    with pytest.warns(None) as record:
        schema._remove_semantic_tags(['tag1', 'category'], 'col_name')
    assert len(record) == 0
    assert schema.semantic_tags == set()

    # Check that warning is not raised if use_standard_tags is False and no Logical Type is specified
    schema = ColumnSchema(semantic_tags=['category', 'tag1'],
                          use_standard_tags=False)

    with pytest.warns(None) as record:
        schema._remove_semantic_tags(['tag1', 'category'], 'col_name')
    assert len(record) == 0
    assert schema.semantic_tags == set()
def test_remove_semantic_tags_raises_error_with_invalid_tag():
    schema = ColumnSchema(logical_type=Categorical, semantic_tags='tag1')
    error_msg = re.escape(
        "Semantic tag(s) 'invalid_tagname' not present on column 'col_name'")
    with pytest.raises(LookupError, match=error_msg):
        schema._remove_semantic_tags('invalid_tagname', 'col_name')
def test_ordinal_without_init():
    schema = ColumnSchema(logical_type=Ordinal)
    assert isinstance(schema.logical_type, Ordinal)
    assert schema.logical_type.order is None
def test_reset_semantic_tags_without_standard_tags():
    semantic_tags = 'initial_tag'
    schema = ColumnSchema(semantic_tags=semantic_tags, use_standard_tags=False)

    schema._reset_semantic_tags()
    assert schema.semantic_tags == set()
def test_schema_shallow_equality():
    no_metadata_1 = ColumnSchema(logical_type=Categorical)
    no_metadata_2 = ColumnSchema(logical_type=Categorical)

    assert no_metadata_1.__eq__(no_metadata_2, deep=False)
    assert no_metadata_1.__eq__(no_metadata_2, deep=True)

    metadata_1 = ColumnSchema(logical_type=Categorical,
                              metadata={"interesting_values": ["a", "b"]})
    metadata_2 = ColumnSchema(logical_type=Categorical,
                              metadata={"interesting_values": ["a", "b"]})
    metadata_3 = ColumnSchema(logical_type=Categorical,
                              metadata={"interesting_values": ["c", "d"]})

    assert metadata_1.__eq__(metadata_2, deep=False)
    assert metadata_1.__eq__(metadata_2, deep=True)
    assert metadata_1.__eq__(metadata_3, deep=False)
    assert not metadata_1.__eq__(metadata_3, deep=True)
def test_schema_equality():
    col = ColumnSchema(logical_type=Categorical)
    diff_description_col = ColumnSchema(logical_type=Categorical,
                                        description="description")
    diff_origin_col = ColumnSchema(logical_type=Categorical, origin="base")
    diff_metadata_col = ColumnSchema(
        logical_type=Categorical, metadata={"interesting_values": ["a", "b"]})
    use_standard_tags_col = ColumnSchema(logical_type=Categorical,
                                         use_standard_tags=True)
    diff_tags_col = ColumnSchema(logical_type=Categorical,
                                 semantic_tags={"new_tag"})

    assert col != diff_description_col
    assert col != diff_origin_col
    assert col != diff_metadata_col
    assert col != use_standard_tags_col
    assert col != diff_tags_col

    # Check columns with same logical types but different parameters
    ordinal_ltype_1 = Ordinal(order=["a", "b", "c"])
    ordinal_ltype_2 = Ordinal(order=["b", "a", "c"])
    ordinal_col_1 = ColumnSchema(logical_type=ordinal_ltype_1)
    ordinal_col_2 = ColumnSchema(logical_type=ordinal_ltype_2)

    assert col != ordinal_col_1
    assert ordinal_col_1 != ordinal_col_2
    assert ordinal_col_1 == ordinal_col_1

    datetime_ltype_instantiated = Datetime(datetime_format="%Y-%m%d")

    datetime_col_format = ColumnSchema(
        logical_type=datetime_ltype_instantiated)
    datetime_col_param = ColumnSchema(logical_type=Datetime(
        datetime_format=None))
    datetime_col_instantiated = ColumnSchema(logical_type=Datetime())

    assert datetime_col_instantiated != datetime_col_format
    assert datetime_col_instantiated == datetime_col_param
Example #15
class LSA(TransformPrimitive):
    """Calculates the Latent Semantic Analysis Values of NaturalLanguage Input

    Description:
        Given a list of strings, transforms those strings using tf-idf and single
        value decomposition to go from a sparse matrix to a compact matrix with two
        values for each string. These values represent that Latent Semantic Analysis
        of each string. These values will represent their context with respect to
        (nltk's gutenberg corpus.)[https://www.nltk.org/book/ch02.html#gutenberg-corpus]

        If a string is missing, return `NaN`.

    Examples:
        >>> lsa = LSA()
        >>> x = ["he helped her walk,", "me me me eat food", "the sentence doth long"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.01, 0.01, 0.01], [0.0, 0.0, 0.01]]

        NaN values are handled, as are strings without any words:

        >>> lsa = LSA()
        >>> x = ["the earth is round", "", np.NaN, ".,/"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.02, 0.0, nan, 0.0], [0.02, 0.0, nan, 0.0]]

    """
    name = "lsa"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
    default_value = 0

    def __init__(self):
        # TODO: allow user to use own corpus
        self.number_output_features = 2
        self.n = 2

        gutenberg = nltk.corpus.gutenberg.sents()
        self.trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD())
        self.trainer.fit([" ".join(sent) for sent in gutenberg])

    def get_function(self):
        dtk = TreebankWordDetokenizer()

        def lsa(array):
            array = pd.Series(array, index=pd.Series(array.index), name='array')
            # Drop missing values, then detokenize the cleaned tokens back
            # into plain strings before applying the fitted tf-idf/SVD pipeline
            copy = array.dropna()
            copy = copy.apply(lambda x: dtk.detokenize(clean_tokens(x)))
            li = self.trainer.transform(copy)
            lsa1 = pd.Series(li[:, 0], index=copy.index)
            lsa2 = pd.Series(li[:, 1], index=copy.index)
            # Reattach both components to the original index so rows that were
            # NaN on input stay NaN in the output
            array = pd.DataFrame(array)
            array['l1'] = lsa1
            array['l2'] = lsa2

            arr = ((np.array(array[['l1', 'l2']])).T).tolist()
            return pd.Series(arr)

        return lsa
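
A small hedged check of the primitive defined above, calling its function directly (mirrors the docstring examples; assumes the nltk Gutenberg corpus is available locally, and exact values depend on the fitted pipeline):

import pandas as pd

lsa = LSA()  # fits TfidfVectorizer + TruncatedSVD on the Gutenberg corpus
func = lsa.get_function()
result = func(pd.Series(["he helped her walk,", "me me me eat food"]))
# `result` is a Series of two lists (one per output feature),
# each holding one LSA component value per input string
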
Example #16
class Haversine(TransformPrimitive):
    """Calculates the approximate haversine distance between two LatLong columns.

    Args:
        unit (str): Determines the unit value to output. Could
            be `miles` or `kilometers`. Default is `miles`.

    Examples:
        >>> haversine = Haversine()
        >>> distances = haversine([(42.4, -71.1), (40.0, -122.4)],
        ...                       [(40.0, -122.4), (41.2, -96.75)])
        >>> np.round(distances, 3).tolist()
        [2631.231, 1343.289]

        Output units can be specified

        >>> haversine_km = Haversine(unit='kilometers')
        >>> distances_km = haversine_km([(42.4, -71.1), (40.0, -122.4)],
        ...                             [(40.0, -122.4), (41.2, -96.75)])
        >>> np.round(distances_km, 3).tolist()
        [4234.555, 2161.814]
    """

    name = "haversine"
    input_types = [
        ColumnSchema(logical_type=LatLong),
        ColumnSchema(logical_type=LatLong),
    ]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    commutative = True

    def __init__(self, unit="miles"):
        valid_units = ["miles", "kilometers"]
        if unit not in valid_units:
            error_message = "Invalid unit %s provided. Must be one of %s" % (
                unit,
                valid_units,
            )
            raise ValueError(error_message)
        self.unit = unit
        self.description_template = (
            "the haversine distance in {} between {{}} and {{}}".format(self.unit)
        )

    def get_function(self):
        def haversine(latlong_1, latlong_2):
            latlong_1 = np.array(latlong_1.tolist())
            latlong_2 = np.array(latlong_2.tolist())
            lat_1s = latlong_1[:, 0]
            lat_2s = latlong_2[:, 0]
            lon_1s = latlong_1[:, 1]
            lon_2s = latlong_2[:, 1]

            distance = _haversine_calculate(lat_1s, lon_1s, lat_2s, lon_2s, self.unit)
            return distance

        return haversine

    def generate_name(self, base_feature_names):
        name = "{}(".format(self.name.upper())
        name += ", ".join(base_feature_names)
        if self.unit != "miles":
            name += ", unit={}".format(self.unit)
        name += ")"
        return name
class TestTime(TransformPrimitive):
    name = "test_time"
    input_types = [ColumnSchema(logical_type=Datetime)]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    number_output_features = 6
Example #18
class MultiCumulative(TransformPrimitive):
    name = "multi_cum_sum"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    number_output_features = 3

class ThreeMostCommonCat(AggregationPrimitive):
    name = "n_most_common_categorical"
    input_types = [ColumnSchema(semantic_tags={"category"})]
    return_type = ColumnSchema(semantic_tags={"category"})
    number_output_features = 3
Example #20
class NewMax(AggregationPrimitive):
    name = "new_max"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
Example #21
def test_column_schema_standard_tags():
    column = ColumnSchema(logical_type=Integer, use_standard_tags=True)

    assert column.semantic_tags == {'numeric'}
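
For contrast, a quick hedged sketch with standard tags disabled, matching the behavior exercised by the tests above:

column_no_std = ColumnSchema(logical_type=Integer, use_standard_tags=False)
assert column_no_std.semantic_tags == set()
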
    def build_features(self, return_types=None, verbose=False):
        """Automatically builds feature definitions for target
            dataframe using Deep Feature Synthesis algorithm

        Args:
            return_types (list[woodwork.ColumnSchema] or str, optional):
                List of ColumnSchemas defining the types of
                columns to return. If None, defaults to returning all
                numeric, categorical and boolean types. If given as
                the string 'all', use all available return types.

            verbose (bool, optional): If True, print progress.

        Returns:
            list[BaseFeature]: Returns a list of
                features for target dataframe, sorted by feature depth
                (shallow first).
        """
        all_features = {}

        self.where_clauses = defaultdict(set)

        if return_types is None:
            return_types = [
                ColumnSchema(semantic_tags=["numeric"]),
                ColumnSchema(semantic_tags=["category"]),
                ColumnSchema(logical_type=Boolean),
                ColumnSchema(logical_type=BooleanNullable),
            ]
        elif return_types == "all":
            pass
        else:
            msg = "return_types must be a list, or 'all'"
            assert isinstance(return_types, list), msg

        self._run_dfs(
            self.es[self.target_dataframe_name],
            RelationshipPath([]),
            all_features,
            max_depth=self.max_depth,
        )

        new_features = list(all_features[self.target_dataframe_name].values())

        def filt(f):
            # remove identity features of the ID field of the target dataframe
            if (isinstance(f, IdentityFeature)
                    and f.dataframe_name == self.target_dataframe_name
                    and f.column_name
                    == self.es[self.target_dataframe_name].ww.index):
                return False

            return True

        # filter out features with undesired return types
        if return_types != "all":
            new_features = [
                f for f in new_features if any(
                    is_valid_input(f.column_schema, schema)
                    for schema in return_types)
            ]
        new_features = list(filter(filt, new_features))

        new_features.sort(key=lambda f: f.get_depth())

        new_features = self._filter_features(new_features)

        if self.max_features > 0:
            new_features = new_features[:self.max_features]

        if verbose:
            print("Built {} features".format(len(new_features)))
            verbose = None
        return new_features
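
A hedged sketch of driving build_features directly (the EntitySet `es` and primitive choices are illustrative; most users reach this method through featuretools' dfs entry point):

from woodwork.column_schema import ColumnSchema

from featuretools.synthesis.deep_feature_synthesis import DeepFeatureSynthesis

# `es` is an assumed, already-populated EntitySet
dfs_object = DeepFeatureSynthesis("customers", es, agg_primitives=["max"], trans_primitives=[])
numeric_features = dfs_object.build_features(
    return_types=[ColumnSchema(semantic_tags=["numeric"])],
    verbose=True,
)
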
Example #23
class CustomAgg(AggregationPrimitive):
    name = "custom_aggregation"
    input_types = [ColumnSchema(semantic_tags={"category"})]
    return_type = ColumnSchema(semantic_tags={"category"})
    def _build_transform_features(self,
                                  all_features,
                                  dataframe,
                                  max_depth=0,
                                  require_direct_input=False):
        """Creates trans_features for all the columns in a dataframe

        Args:
            all_features (dict[dataframe name: dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each dataframe. Each nested dict
                has features as values with their ids as keys

            dataframe (DataFrame): DataFrame to calculate features for.
        """

        new_max_depth = None
        if max_depth is not None:
            new_max_depth = max_depth - 1

        # Keep track of features to add until the end to avoid applying
        # transform primitives to features that were also built by transform primitives
        features_to_add = []

        for trans_prim in self.trans_primitives:
            current_options = self.primitive_options.get(
                trans_prim, self.primitive_options.get(trans_prim.name))
            if ignore_dataframe_for_primitive(current_options, dataframe):
                continue

            input_types = trans_prim.input_types

            matching_inputs = self._get_matching_inputs(
                all_features,
                dataframe,
                new_max_depth,
                input_types,
                trans_prim,
                current_options,
                require_direct_input=require_direct_input,
            )

            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input) and check_transform_stacking(
                           matching_input):
                    new_f = TransformFeature(matching_input,
                                             primitive=trans_prim)
                    features_to_add.append(new_f)

        for groupby_prim in self.groupby_trans_primitives:
            current_options = self.primitive_options.get(
                groupby_prim, self.primitive_options.get(groupby_prim.name))
            if ignore_dataframe_for_primitive(current_options,
                                              dataframe,
                                              groupby=True):
                continue
            input_types = groupby_prim.input_types[:]
            matching_inputs = self._get_matching_inputs(
                all_features,
                dataframe,
                new_max_depth,
                input_types,
                groupby_prim,
                current_options,
            )

            # get columns to use as groupbys, use IDs as default unless other groupbys specified
            if any([
                    "include_groupby_columns" in option
                    and dataframe.ww.name in option["include_groupby_columns"]
                    for option in current_options
            ]):
                column_schemas = "all"
            else:
                column_schemas = [ColumnSchema(semantic_tags=["foreign_key"])]
            groupby_matches = self._features_by_type(
                all_features=all_features,
                dataframe=dataframe,
                max_depth=new_max_depth,
                column_schemas=column_schemas,
            )
            groupby_matches = filter_groupby_matches_by_options(
                groupby_matches, current_options)

            # If require_direct_input, require a DirectFeature in input or as a
            # groupby, and don't create features of inputs/groupbys which are
            # all direct features with the same relationship path
            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input) and check_transform_stacking(
                           matching_input):
                    for groupby in groupby_matches:
                        if require_direct_input and (
                                _all_direct_and_same_path(matching_input +
                                                          (groupby, ))
                                or not any([
                                    isinstance(feature, DirectFeature)
                                    for feature in (matching_input +
                                                    (groupby, ))
                                ])):
                            continue
                        new_f = GroupByTransformFeature(
                            list(matching_input),
                            groupby=groupby[0],
                            primitive=groupby_prim,
                        )
                        features_to_add.append(new_f)
        for new_f in features_to_add:
            self._handle_new_feature(all_features=all_features,
                                     new_feature=new_f)
class AboveTen(TransformPrimitive):
    name = "above_ten"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
def _match_contains_numeric_foreign_key(match):
    match_schema = ColumnSchema(semantic_tags={"foreign_key", "numeric"})
    return any(is_valid_input(f.column_schema, match_schema) for f in match)
Example #27
class CustomMean(AggregationPrimitive):
    name = "custom_mean"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
Example #28
from woodwork.column_schema import ColumnSchema

from featuretools.primitives.base import make_agg_primitive

CustomSum = make_agg_primitive(
    lambda x: sum(x),
    name="CustomSum",
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}))
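
A quick hedged check of the generated primitive outside of DFS (assuming make_agg_primitive wraps the lambda so that get_function returns it):

import pandas as pd

sum_func = CustomSum().get_function()
assert sum_func(pd.Series([1, 2, 3])) == 6
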
class CustomMultiOutput(TransformPrimitive):
    name = "custom_multioutput"
    input_types = [ColumnSchema(semantic_tags={'category'})]
    return_type = ColumnSchema(semantic_tags={'category'})

    number_output_features = 4
class CountString(TransformPrimitive):
    """Determines how many times a given string shows up in a text field.
    Args:
        string (str): The string to determine the count of. Defaults to
            the word "the".
        ignore_case (bool): Determines if case of the string should be
            considered or not. Defaults to true.
        ignore_non_alphanumeric (bool): Determines if non-alphanumeric
            characters should be used in the search. Defaults to False.
        is_regex (bool): Defines if the string argument is a regex or not.
            Defaults to False.
        match_whole_words_only (bool): Determines if whole words should be
            matched or not. For example searching for word `the` against
            `then, the, there` should only return `the` if this argument
            was True. Defaults to False.
    Examples:
        >>> count_string = CountString(string="the")
        >>> count_string(["The problem was difficult.",
        ...               "He was there.",
        ...               "The girl went to the store."]).tolist()
        [1, 1, 2]
        >>> # Match case of string
        >>> count_string_ignore_case = CountString(string="the", ignore_case=False)
        >>> count_string_ignore_case(["The problem was difficult.",
        ...                           "He was there.",
        ...                           "The girl went to the store."]).tolist()
        [0, 1, 1]
        >>> # Ignore non-alphanumeric characters in the search
        >>> count_string_ignore_non_alphanumeric = CountString(string="the",
        ...                                                    ignore_non_alphanumeric=True)
        >>> count_string_ignore_non_alphanumeric(["Th*/e problem was difficult.",
        ...                                       "He was there.",
        ...                                       "The girl went to the store."]).tolist()
        [1, 1, 2]
        >>> # Specify the string as a regex
        >>> count_string_is_regex = CountString(string="t.e", is_regex=True)
        >>> count_string_is_regex(["The problem was difficult.",
        ...                        "He was there.",
        ...                        "The girl went to the store."]).tolist()
        [1, 1, 2]
        >>> # Match whole words only
        >>> count_string_match_whole_words_only = CountString(string="the",
        ...                                                   match_whole_words_only=True)
        >>> count_string_match_whole_words_only(["The problem was difficult.",
        ...                                      "He was there.",
        ...                                      "The girl went to the store."]).tolist()
        [1, 0, 2]
    """
    name = "count_string"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Integer, semantic_tags={'numeric'})

    def __init__(self,
                 string='the',
                 ignore_case=True,
                 ignore_non_alphanumeric=False,
                 is_regex=False,
                 match_whole_words_only=False):
        self.string = string
        self.ignore_case = ignore_case
        self.ignore_non_alphanumeric = ignore_non_alphanumeric
        self.match_whole_words_only = match_whole_words_only
        self.is_regex = is_regex

        # We don't want to strip non-alphanumeric characters from the pattern:
        # e.g. `h.ll.` should match "hello", so we can't strip the dots to make `hll`
        if not is_regex:
            self.pattern = re.escape(self.process_text(string))
        else:
            self.pattern = string
            if ignore_case:
                self.pattern = self.pattern.lower()

        # \b\b.*\b\b is the same as \b.*\b so we don't have to check if
        # the pattern is given to us as regex and if it already has leading
        # and trailing \b's
        if match_whole_words_only:
            self.pattern = "\\b" + self.pattern + "\\b"

    def process_text(self, text):
        if self.ignore_non_alphanumeric:
            text = re.sub('[^0-9a-zA-Z ]+', '', text)
        if self.ignore_case:
            text = text.lower()
        return text

    def get_function(self):
        def count_string(words):
            if type(words) != str:
                return np.nan
            words = self.process_text(words)
            return len(re.findall(self.pattern, words))

        return np.vectorize(count_string)