class WoodworkColumnAccessor:
    """Accessor that stores and exposes Woodwork typing information for a Series.

    The accessor wraps a series (``self._series``) and a column schema
    (``self._schema``). The schema is ``None`` until ``init`` is called; most
    members raise the initialization error while the schema is unset.
    """

    def __init__(self, series):
        self._series = series
        self._schema = None

    def init(self, logical_type=None, semantic_tags=None,
             use_standard_tags=True, description=None, metadata=None,
             schema=None, validate=True):
        """Initializes Woodwork typing information for a Series.

        Args:
            logical_type (LogicalType or str, optional): The logical type that should be assigned
                to the series. If no value is provided, the LogicalType for the series will
                be inferred. If the LogicalType provided or inferred does not have a dtype that
                is compatible with the series dtype, an error will be raised.
            semantic_tags (str or list or set, optional): Semantic tags to assign to the series.
                Defaults to an empty set if not specified. There are two options for
                specifying the semantic tags:
                (str) If only one semantic tag is being set, a single string can be passed.
                (list or set) If multiple tags are being set, a list or set of strings can be passed.
            use_standard_tags (bool, optional): If True, will add standard semantic tags to the series
                based on the inferred or specified logical type of the series. Defaults to True.
            description (str, optional): Optional text describing the contents of the series.
            metadata (dict[str -> json serializable], optional): Metadata associated with the series.
            schema (Woodwork.ColumnSchema, optional): Typing information to use for the Series instead of performing inference.
                Any other arguments provided will be ignored. Note that any changes made to the schema object after
                initialization will propagate to the Series. Similarly, to avoid unintended typing information changes,
                the same schema object should not be shared between Series.
            validate (bool, optional): Whether parameter and data validation should occur. Defaults to True. Warning:
                Should be set to False only when parameters and data are known to be valid.
                Any errors resulting from skipping validation with invalid inputs may not be easily understood.
        """
        if schema is not None:
            if validate:
                _validate_schema(schema, self._series)

            # Collect every typing argument the caller supplied alongside the
            # schema so a single warning can report what is being ignored.
            extra_params = []
            if logical_type is not None:
                extra_params.append('logical_type')
            if semantic_tags is not None:
                extra_params.append('semantic_tags')
            if description is not None:
                extra_params.append('description')
            if metadata is not None:
                extra_params.append('metadata')
            if not use_standard_tags:
                extra_params.append('use_standard_tags')
            if extra_params:
                warnings.warn(
                    "A schema was provided and the following parameters were ignored: "
                    + ", ".join(extra_params),
                    ParametersIgnoredWarning)

            self._schema = schema
        else:
            logical_type = _get_column_logical_type(self._series, logical_type,
                                                    self._series.name)

            if validate:
                self._validate_logical_type(logical_type)

            self._schema = ColumnSchema(logical_type=logical_type,
                                        semantic_tags=semantic_tags,
                                        use_standard_tags=use_standard_tags,
                                        description=description,
                                        metadata=metadata,
                                        validate=validate)

    @property
    def schema(self):
        """A deep copy of the column schema, so callers cannot mutate the stored one"""
        return copy.deepcopy(self._schema)

    @property
    def description(self):
        """The description of the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.description

    @description.setter
    def description(self, description):
        if self._schema is None:
            _raise_init_error()
        _validate_description(description)
        self._schema.description = description

    @property
    def iloc(self):
        """
        Integer-location based indexing for selection by position.
        ``.iloc[]`` is primarily integer position based (from ``0`` to
        ``length-1`` of the axis), but may also be used with a boolean array.

        If the selection result is a Series, Woodwork typing information will
        be initialized for the returned Series.

        Allowed inputs are:
            An integer, e.g. ``5``.
            A list or array of integers, e.g. ``[4, 3, 0]``.
            A slice object with ints, e.g. ``1:7``.
            A boolean array.
            A ``callable`` function with one argument (the calling Series, DataFrame
            or Panel) and that returns valid output for indexing (one of the above).
            This is useful in method chains, when you don't have a reference to the
            calling object, but would like to base your selection on some value.
        """
        if self._schema is None:
            _raise_init_error()
        return _iLocIndexer(self._series)

    @property
    def loc(self):
        """
        Access a group of rows by label(s) or a boolean array.

        ``.loc[]`` is primarily label based, but may also be used with a
        boolean array.

        If the selection result is a Series, Woodwork typing information will
        be initialized for the returned Series.

        Allowed inputs are:
            A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
            interpreted as a *label* of the index, and **never** as an
            integer position along the index).
            A list or array of labels, e.g. ``['a', 'b', 'c']``.
            A slice object with labels, e.g. ``'a':'f'``.
            A boolean array of the same length as the axis being sliced,
            e.g. ``[True, False, True]``.
            An alignable boolean Series. The index of the key will be aligned before
            masking.
            An alignable Index. The Index of the returned selection will be the input.
            A ``callable`` function with one argument (the calling Series or
            DataFrame) and that returns valid output for indexing (one of the above)
        """
        if self._schema is None:
            _raise_init_error()
        return _locIndexer(self._series)

    @property
    def logical_type(self):
        """The logical type of the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.logical_type

    @property
    def metadata(self):
        """The metadata of the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.metadata

    @metadata.setter
    def metadata(self, metadata):
        if self._schema is None:
            _raise_init_error()
        _validate_metadata(metadata)
        self._schema.metadata = metadata

    @property
    def semantic_tags(self):
        """The semantic tags assigned to the series"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.semantic_tags

    @property
    def use_standard_tags(self):
        """The ``use_standard_tags`` setting stored on the schema"""
        if self._schema is None:
            _raise_init_error()
        return self._schema.use_standard_tags

    def __eq__(self, other):
        # Comparing against an unrelated object used to fail with
        # AttributeError on ``other._schema``; defer to Python's default
        # handling instead, which makes ``==`` evaluate to False.
        if not isinstance(other, WoodworkColumnAccessor):
            return NotImplemented
        if self._schema != other._schema:
            return False
        if self._series.name != other._series.name:
            return False
        # Data is only compared for pandas Series; other series types are
        # considered equal once schema and name match.
        if isinstance(self._series, pd.Series):
            return self._series.equals(other._series)
        return True

    def __getattr__(self, attr):
        # If the method is present on the Accessor, uses that method.
        # If the method is present on Series, uses that method.

        # __getattr__ only runs when normal lookup fails. If one of the
        # accessor's own attributes is missing (instance built without
        # __init__, e.g. while being copied or unpickled), reading
        # ``self._schema`` below would re-enter __getattr__ indefinitely.
        if attr in ('_schema', '_series'):
            raise AttributeError(f"Woodwork has no attribute '{attr}'")
        if self._schema is None:
            _raise_init_error()
        if hasattr(self._series, attr):
            return self._make_series_call(attr)
        else:
            raise AttributeError(f"Woodwork has no attribute '{attr}'")

    def __repr__(self):
        if self._schema is None:
            _raise_init_error()
        return (
            f"<Series: {self._series.name} "
            f"(Physical Type = {self._series.dtype}) "
            f"(Logical Type = {self.logical_type}) "
            f"(Semantic Tags = {self.semantic_tags})>"
        )

    def _make_series_call(self, attr):
        """Forwards the requested attribute onto the series object. Intercepts return value,
        attempting to initialize Woodwork with the current schema when a new Series is returned.
        Confirms schema is still valid for the original Series."""
        series_attr = getattr(self._series, attr)

        if callable(series_attr):
            def wrapper(*args, **kwargs):
                # Make Series call and intercept the result
                result = series_attr(*args, **kwargs)

                # Try to initialize Woodwork with the existing schema
                if _is_series(result):
                    valid_dtype = _get_valid_dtype(type(result), self._schema.logical_type)
                    if str(result.dtype) == valid_dtype:
                        # self.schema is a deep copy, so the result does not
                        # share a schema object with this series.
                        result.ww.init(schema=self.schema, validate=False)
                    else:
                        invalid_schema_message = 'dtype mismatch between original dtype, ' \
                            f'{valid_dtype}, and returned dtype, {result.dtype}'
                        warning_message = TypingInfoMismatchWarning(
                        ).get_warning_message(attr, invalid_schema_message, 'Series')
                        warnings.warn(warning_message, TypingInfoMismatchWarning)
                # Always return the results of the Series operation whether or not Woodwork is initialized
                return result
            return wrapper
        # Directly return non-callable Series attributes
        return series_attr

    def _validate_logical_type(self, logical_type):
        """Validates that a logical type is consistent with the series dtype. Performs additional type
        specific validation, as required.

        Raises:
            ValueError: If the series dtype does not match the dtype required by the
                logical type, or if a LatLong series is not properly formatted.
        """
        valid_dtype = _get_valid_dtype(type(self._series), logical_type)
        if valid_dtype != str(self._series.dtype):
            raise ValueError(
                f"Cannot initialize Woodwork. Series dtype '{self._series.dtype}' is "
                f"incompatible with {logical_type} dtype. Try converting series "
                f"dtype to '{valid_dtype}' before initializing or use the "
                "woodwork.init_series function to initialize.")

        if isinstance(logical_type, Ordinal):
            logical_type._validate_data(self._series)
        elif logical_type == LatLong:
            if not _is_valid_latlong_series(self._series):
                raise ValueError(
                    "Cannot initialize Woodwork. Series does not contain properly formatted "
                    "LatLong data. Try reformatting before initializing or use the "
                    "woodwork.init_series function to initialize.")

    def add_semantic_tags(self, semantic_tags):
        """Add the specified semantic tags to the set of tags.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to add
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._add_semantic_tags(semantic_tags, self._series.name)

    def remove_semantic_tags(self, semantic_tags):
        """Removes specified semantic tags from the current tags.

        Args:
            semantic_tags (str/list/set): Semantic tag(s) to remove.
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._remove_semantic_tags(semantic_tags, self._series.name)

    def reset_semantic_tags(self):
        """Reset the semantic tags to the default values. The default values
        will be either an empty set or a set of the standard tags based on the
        column logical type, controlled by the use_standard_tags property.

        Args:
            None
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._reset_semantic_tags()

    def set_logical_type(self, logical_type):
        """Update the logical type for the series, clearing any previously set semantic tags,
        and returning a new series with Woodwork initialized.

        Args:
            logical_type (LogicalType, str): The new logical type to set for the series.

        Returns:
            Series: A new series with the updated logical type.
        """
        if self._schema is None:
            _raise_init_error()
        # Create a new series without a schema to prevent new series from sharing a common
        # schema with current series
        new_series = self._series.copy()
        # NOTE(review): this assigns ``_schema`` on the Series object itself,
        # not on its Woodwork accessor - confirm whether the accessor's schema
        # was the intended target.
        new_series._schema = None
        return init_series(new_series,
                           logical_type=logical_type,
                           semantic_tags=None,
                           use_standard_tags=self._schema.use_standard_tags,
                           description=self.description,
                           metadata=copy.deepcopy(self.metadata))

    def set_semantic_tags(self, semantic_tags):
        """Replace current semantic tags with new values. If `use_standard_tags` is set
        to True for the series, any standard tags associated with the LogicalType of the
        series will be added as well.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to set
        """
        if self._schema is None:
            _raise_init_error()
        self._schema._set_semantic_tags(semantic_tags)
class MaxAboveTen(AggregationPrimitive):
    # Primitive declaration only: a single numeric-tagged input column and a
    # numeric-tagged return column. No calculation is defined in this block.
    name = "max_above_ten"
    input_types = [
        ColumnSchema(semantic_tags={"numeric"}),
    ]
    return_type = ColumnSchema(
        semantic_tags={"numeric"},
    )
class DirectFeature(FeatureBase):
    """Feature for child dataframe that inherits
    a feature value from a parent dataframe"""

    input_types = [ColumnSchema()]
    return_type = None

    def __init__(
        self, base_feature, child_dataframe_name, relationship=None, name=None
    ):
        """Build the feature on ``child_dataframe_name`` from a parent feature.

        When ``relationship`` is omitted, the single forward relationship from
        the child dataframe to the base feature's dataframe is looked up.
        """
        base_feature = _validate_base_features(base_feature)[0]
        self.parent_dataframe_name = base_feature.dataframe_name

        entityset = base_feature.entityset
        relationship = self._handle_relationship(
            entityset, child_dataframe_name, relationship
        )

        super().__init__(
            dataframe=entityset[child_dataframe_name],
            base_features=[base_feature],
            relationship_path=RelationshipPath([(True, relationship)]),
            primitive=PrimitiveBase,
            name=name,
        )

    def _handle_relationship(self, entityset, child_dataframe_name, relationship):
        """Validate the given relationship, or find the unique one to use.

        Raises:
            RuntimeError: If no relationship, or more than one relationship,
                links the child dataframe to the parent dataframe.
        """
        child_dataframe = entityset[child_dataframe_name]

        # An explicit relationship only needs to be checked against both ends.
        if relationship:
            assert (
                child_dataframe.ww.name == relationship.child_dataframe.ww.name
            ), "child_dataframe must be the relationship child dataframe"
            assert (
                self.parent_dataframe_name == relationship.parent_dataframe.ww.name
            ), "Base feature must be defined on the relationship parent dataframe"
            return relationship

        child_name = child_dataframe.ww.name
        candidates = [
            rel
            for rel in entityset.get_forward_relationships(child_name)
            if rel.parent_dataframe.ww.name == self.parent_dataframe_name
        ]
        relationship = candidates[0] if candidates else None

        if not relationship:
            raise RuntimeError(
                'No relationship from "%s" to "%s" found.'
                % (child_name, self.parent_dataframe_name)
            )

        # A second match means the path is ambiguous.
        if len(candidates) > 1:
            message = (
                "There are multiple relationships to the base dataframe. "
                "You must specify a relationship."
            )
            raise RuntimeError(message)

        return relationship

    @classmethod
    def from_dictionary(cls, arguments, entityset, dependencies, primitive):
        """Rebuild a feature from the dictionary produced by ``get_arguments``."""
        relationship = Relationship.from_dictionary(
            arguments["relationship"], entityset
        )
        return cls(
            base_feature=dependencies[arguments["base_feature"]],
            child_dataframe_name=relationship.child_dataframe.ww.name,
            relationship=relationship,
            name=arguments["name"],
        )

    @property
    def _parent_feature(self):
        # The single base feature this direct feature mirrors.
        return self.base_features[0]

    @property
    def number_output_features(self):
        return self._parent_feature.number_output_features

    @property
    def default_value(self):
        return self._parent_feature.default_value

    def copy(self):
        """Return copy of feature"""
        relationship = self.relationship_path[0][1]
        return DirectFeature(
            self._parent_feature, self.dataframe_name, relationship=relationship
        )

    @property
    def column_schema(self):
        return self._parent_feature.column_schema

    def generate_name(self):
        return self._name_from_base(self._parent_feature.get_name())

    def generate_names(self):
        base_names = self._parent_feature.get_feature_names()
        return [self._name_from_base(base_name) for base_name in base_names]

    def get_arguments(self):
        """Return the serializable arguments needed by ``from_dictionary``."""
        relationship = self.relationship_path[0][1]
        return {
            "name": self.get_name(),
            "base_feature": self._parent_feature.unique_name(),
            "relationship": relationship.to_dictionary(),
        }

    def _name_from_base(self, base_name):
        # Prefix the base feature name with the relationship path name.
        return f"{self.relationship_path_name()}.{base_name}"
def test_column_schema_params():
    """Description and metadata given to ColumnSchema are stored unchanged."""
    expected_description = 'this is a column!'
    expected_metadata = {'created_by': 'user1'}

    column = ColumnSchema(
        logical_type=Integer,
        description=expected_description,
        # Pass a separate dict so the comparison is by value, not identity.
        metadata=dict(expected_metadata),
    )

    assert column.description == expected_description
    assert column.metadata == expected_metadata
class CustomTrans(TransformPrimitive):
    # Primitive declaration only: one category-tagged input column.
    # NOTE(review): this sets ``output_type`` whereas the other primitives in
    # this file set ``return_type`` - confirm the attribute name is intended.
    name = "custom_transform"
    input_types = [
        ColumnSchema(semantic_tags={"category"}),
    ]
    output_type = ColumnSchema(
        semantic_tags={"category"},
    )
def test_is_valid_input():
    """Check which candidate schemas satisfy which template schemas."""
    # (candidate, template) pairs that must be accepted.
    accepted = [
        (ColumnSchema(), ColumnSchema()),
        (ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
         ColumnSchema(logical_type=Integer, semantic_tags={'index'})),
        (ColumnSchema(logical_type=Integer, semantic_tags={'index', 'numeric'}),
         ColumnSchema(semantic_tags={'index'})),
        (ColumnSchema(semantic_tags={'index'}),
         ColumnSchema(semantic_tags={'index'})),
        (ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
         ColumnSchema()),
        (ColumnSchema(logical_type=Integer),
         ColumnSchema(logical_type=Integer)),
        (ColumnSchema(logical_type=Integer, semantic_tags={'numeric'}),
         ColumnSchema(logical_type=Integer)),
    ]
    for candidate, template in accepted:
        assert is_valid_input(candidate=candidate, template=template)

    # (candidate, template) pairs that must be rejected.
    rejected = [
        (ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
         ColumnSchema(logical_type=Double, semantic_tags={'index'})),
        (ColumnSchema(logical_type=Integer, semantic_tags={}),
         ColumnSchema(logical_type=Integer, semantic_tags={'index'})),
        (ColumnSchema(),
         ColumnSchema(logical_type=Integer, semantic_tags={'index'})),
        (ColumnSchema(),
         ColumnSchema(logical_type=Integer)),
        (ColumnSchema(),
         ColumnSchema(semantic_tags={'index'})),
    ]
    for candidate, template in rejected:
        assert not is_valid_input(candidate=candidate, template=template)
def test_schema_equality():
    """Schemas differing in any single property must compare unequal."""
    col = ColumnSchema(logical_type=Categorical)
    single_property_variants = [
        ColumnSchema(logical_type=Categorical, description='description'),
        ColumnSchema(logical_type=Categorical,
                     metadata={'interesting_values': ['a', 'b']}),
        ColumnSchema(logical_type=Categorical, use_standard_tags=True),
        ColumnSchema(logical_type=Categorical, semantic_tags={'new_tag'}),
    ]
    for variant in single_property_variants:
        assert col != variant

    # Check columns with same logical types but different parameters
    ordinal_col_1 = ColumnSchema(logical_type=Ordinal(order=['a', 'b', 'c']))
    ordinal_col_2 = ColumnSchema(logical_type=Ordinal(order=['b', 'a', 'c']))

    assert col != ordinal_col_1
    assert ordinal_col_1 != ordinal_col_2
    assert ordinal_col_1 == ordinal_col_1

    datetime_col_format = ColumnSchema(
        logical_type=Datetime(datetime_format='%Y-%m%d'))
    datetime_col_param = ColumnSchema(
        logical_type=Datetime(datetime_format=None))
    datetime_col_instantiated = ColumnSchema(logical_type=Datetime())
    datetime_col = ColumnSchema(logical_type=Datetime)

    assert datetime_col != datetime_col_instantiated
    assert datetime_col_instantiated != datetime_col_format
    assert datetime_col_instantiated == datetime_col_param
def test_remove_standard_semantic_tag():
    """Removing a standard tag warns only when ``use_standard_tags`` is True."""
    # Imported locally so the test does not rely on a module-level import.
    import warnings

    # Check that warning is raised if use_standard_tags is True - tag should be removed
    schema = ColumnSchema(logical_type=Categorical, semantic_tags='tag1',
                          use_standard_tags=True)
    expected_message = 'Standard tags have been removed from "col_name"'
    with pytest.warns(StandardTagsChangedWarning) as record:
        schema._remove_semantic_tags(['tag1', 'category'], 'col_name')
    assert len(record) == 1
    assert record[0].message.args[0] == expected_message
    assert schema.semantic_tags == set()

    # Check that warning is not raised if use_standard_tags is False - tag should be removed
    schema = ColumnSchema(logical_type=Categorical,
                          semantic_tags=['category', 'tag1'],
                          use_standard_tags=False)
    # ``pytest.warns(None)`` is deprecated in pytest 7 and an error in
    # pytest 8, so warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        schema._remove_semantic_tags(['tag1', 'category'], 'col_name')
    assert len(record) == 0
    assert schema.semantic_tags == set()

    # Check that warning is not raised if use_standard_tags is False and no Logical Type is specified
    schema = ColumnSchema(semantic_tags=['category', 'tag1'],
                          use_standard_tags=False)
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        schema._remove_semantic_tags(['tag1', 'category'], 'col_name')
    assert len(record) == 0
    assert schema.semantic_tags == set()
def test_remove_semantic_tags_raises_error_with_invalid_tag():
    """Removing a tag that is not on the column raises LookupError."""
    schema = ColumnSchema(logical_type=Categorical, semantic_tags='tag1')

    # Escape the message because it contains regex metacharacters.
    expected_pattern = re.escape(
        "Semantic tag(s) 'invalid_tagname' not present on column 'col_name'")

    with pytest.raises(LookupError, match=expected_pattern):
        schema._remove_semantic_tags('invalid_tagname', 'col_name')
def test_ordinal_without_init():
    """Passing the Ordinal class yields an Ordinal instance with no order."""
    ltype = ColumnSchema(logical_type=Ordinal).logical_type

    assert isinstance(ltype, Ordinal)
    assert ltype.order is None
def test_reset_semantic_tags_without_standard_tags():
    """With standard tags disabled, a reset leaves an empty tag set."""
    schema = ColumnSchema(semantic_tags='initial_tag',
                          use_standard_tags=False)

    schema._reset_semantic_tags()

    assert schema.semantic_tags == set()
def test_schema_shallow_equality():
    """Metadata differences only count when ``deep=True``."""
    def make_schema(values=None):
        # Build a Categorical schema, optionally with interesting_values metadata.
        if values is None:
            return ColumnSchema(logical_type=Categorical)
        return ColumnSchema(logical_type=Categorical,
                            metadata={"interesting_values": values})

    no_metadata_1 = make_schema()
    no_metadata_2 = make_schema()
    for deep in (False, True):
        assert no_metadata_1.__eq__(no_metadata_2, deep=deep)

    metadata_1 = make_schema(["a", "b"])
    metadata_2 = make_schema(["a", "b"])
    metadata_3 = make_schema(["c", "d"])

    for deep in (False, True):
        assert metadata_1.__eq__(metadata_2, deep=deep)

    assert metadata_1.__eq__(metadata_3, deep=False)
    assert not metadata_1.__eq__(metadata_3, deep=True)
def test_schema_equality():
    """Schemas differing in any single property must compare unequal."""
    col = ColumnSchema(logical_type=Categorical)
    single_property_variants = [
        ColumnSchema(logical_type=Categorical, description="description"),
        ColumnSchema(logical_type=Categorical, origin="base"),
        ColumnSchema(logical_type=Categorical,
                     metadata={"interesting_values": ["a", "b"]}),
        ColumnSchema(logical_type=Categorical, use_standard_tags=True),
        ColumnSchema(logical_type=Categorical, semantic_tags={"new_tag"}),
    ]
    for variant in single_property_variants:
        assert col != variant

    # Check columns with same logical types but different parameters
    ordinal_col_1 = ColumnSchema(logical_type=Ordinal(order=["a", "b", "c"]))
    ordinal_col_2 = ColumnSchema(logical_type=Ordinal(order=["b", "a", "c"]))

    assert col != ordinal_col_1
    assert ordinal_col_1 != ordinal_col_2
    assert ordinal_col_1 == ordinal_col_1

    datetime_col_format = ColumnSchema(
        logical_type=Datetime(datetime_format="%Y-%m%d"))
    datetime_col_param = ColumnSchema(
        logical_type=Datetime(datetime_format=None))
    datetime_col_instantiated = ColumnSchema(logical_type=Datetime())

    assert datetime_col_instantiated != datetime_col_format
    assert datetime_col_instantiated == datetime_col_param
def init(self, logical_type=None, semantic_tags=None, use_standard_tags=True,
         description=None, metadata=None, schema=None, validate=True):
    """Initializes Woodwork typing information for a Series.

    Either adopts the supplied ``schema`` as-is, or builds a new ColumnSchema
    from the remaining arguments, inferring the logical type when needed.

    Args:
        logical_type (LogicalType or str, optional): The logical type that should be assigned
            to the series. If no value is provided, the LogicalType for the series will
            be inferred. If the LogicalType provided or inferred does not have a dtype that
            is compatible with the series dtype, an error will be raised.
        semantic_tags (str or list or set, optional): Semantic tags to assign to the series.
            Defaults to an empty set if not specified. A single tag may be passed as a
            string; multiple tags may be passed as a list or set of strings.
        use_standard_tags (bool, optional): If True, will add standard semantic tags to the series
            based on the inferred or specified logical type of the series. Defaults to True.
        description (str, optional): Optional text describing the contents of the series.
        metadata (dict[str -> json serializable], optional): Metadata associated with the series.
        schema (Woodwork.ColumnSchema, optional): Typing information to use for the Series instead
            of performing inference. Any other arguments provided will be ignored (a warning
            lists them). Changes made to the schema object after initialization propagate
            to the Series, so the same schema object should not be shared between Series.
        validate (bool, optional): Whether parameter and data validation should occur. Defaults
            to True. Warning: Should be set to False only when parameters and data are known
            to be valid. Any errors resulting from skipping validation with invalid inputs
            may not be easily understood.
    """
    # No schema supplied: resolve the logical type and build a fresh schema.
    if schema is None:
        resolved_type = _get_column_logical_type(self._series, logical_type,
                                                 self._series.name)
        if validate:
            self._validate_logical_type(resolved_type)

        self._schema = ColumnSchema(logical_type=resolved_type,
                                    semantic_tags=semantic_tags,
                                    use_standard_tags=use_standard_tags,
                                    description=description,
                                    metadata=metadata,
                                    validate=validate)
        return

    if validate:
        _validate_schema(schema, self._series)

    # Pair each typing argument with whether the caller overrode its default;
    # those arguments have no effect when a schema is provided.
    overrides = (
        ('logical_type', logical_type is not None),
        ('semantic_tags', semantic_tags is not None),
        ('description', description is not None),
        ('metadata', metadata is not None),
        ('use_standard_tags', not use_standard_tags),
    )
    ignored = [param for param, was_set in overrides if was_set]
    if ignored:
        warnings.warn(
            "A schema was provided and the following parameters were ignored: "
            + ", ".join(ignored),
            ParametersIgnoredWarning)

    self._schema = schema
class LSA(TransformPrimitive):
    """Calculates the Latent Semantic Analysis Values of NaturalLanguage Input

    Description:
        Given a list of strings, transforms those strings using tf-idf and single
        value decomposition to go from a sparse matrix to a compact matrix with two
        values for each string. These values represent that Latent Semantic Analysis
        of each string. These values will represent their context with respect to
        [nltk's gutenberg corpus](https://www.nltk.org/book/ch02.html#gutenberg-corpus).

        If a string is missing, return `NaN`.

    Examples:
        >>> lsa = LSA()
        >>> x = ["he helped her walk,", "me me me eat food", "the sentence doth long"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.01, 0.01, 0.01], [0.0, 0.0, 0.01]]

        Now, if we change the values of the input corpus, to something that better resembles
        the given text, the same given input text will result in a different, more discerning,
        output. Also, NaN values are handled, as well as strings without words.

        >>> lsa = LSA()
        >>> x = ["the earth is round", "", np.NaN, ".,/"]
        >>> res = lsa(x).tolist()
        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
        >>> res
        [[0.02, 0.0, nan, 0.0], [0.02, 0.0, nan, 0.0]]
    """
    name = "lsa"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
    default_value = 0

    def __init__(self):
        # TODO: allow user to use own corpus
        self.number_output_features = 2
        self.n = 2

        # Fit the tf-idf + truncated SVD pipeline on the gutenberg sentences,
        # each sentence joined back into a single string.
        corpus = [" ".join(sentence) for sentence in nltk.corpus.gutenberg.sents()]
        self.trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD())
        self.trainer.fit(corpus)

    def get_function(self):
        """Return the function that maps strings to their two LSA components."""
        detokenizer = TreebankWordDetokenizer()

        def lsa(array):
            array = pd.Series(array, index=pd.Series(array.index), name='array')

            # Only non-missing entries are cleaned and transformed.
            present = array.dropna().apply(
                lambda text: detokenizer.detokenize(clean_tokens(text)))
            components = self.trainer.transform(present)

            # Assigning by index leaves NaN in rows that were dropped above.
            frame = pd.DataFrame(array)
            frame['l1'] = pd.Series(components[:, 0], index=present.index)
            frame['l2'] = pd.Series(components[:, 1], index=present.index)

            # One list per component, each as long as the input.
            return pd.Series(np.array(frame[['l1', 'l2']]).T.tolist())

        return lsa
class Haversine(TransformPrimitive):
    """Calculates the approximate haversine distance between two LatLong columns.

    Args:
        unit (str): Determines the unit value to output. Could
            be `miles` or `kilometers`. Default is `miles`.

    Examples:
        >>> haversine = Haversine()
        >>> distances = haversine([(42.4, -71.1), (40.0, -122.4)],
        ...                       [(40.0, -122.4), (41.2, -96.75)])
        >>> np.round(distances, 3).tolist()
        [2631.231, 1343.289]

        Output units can be specified

        >>> haversine_km = Haversine(unit='kilometers')
        >>> distances_km = haversine_km([(42.4, -71.1), (40.0, -122.4)],
        ...                             [(40.0, -122.4), (41.2, -96.75)])
        >>> np.round(distances_km, 3).tolist()
        [4234.555, 2161.814]
    """

    name = "haversine"
    input_types = [
        ColumnSchema(logical_type=LatLong),
        ColumnSchema(logical_type=LatLong),
    ]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    commutative = True

    def __init__(self, unit="miles"):
        valid_units = ["miles", "kilometers"]
        if unit not in valid_units:
            raise ValueError(
                "Invalid unit %s provided. Must be one of %s" % (unit, valid_units)
            )
        self.unit = unit
        # The two remaining ``{}`` placeholders are left unformatted here.
        self.description_template = (
            f"the haversine distance in {self.unit} between {{}} and {{}}"
        )

    def get_function(self):
        """Return the function computing distances between two LatLong columns."""
        def haversine(latlong_1, latlong_2):
            # Column 0 is read as latitude and column 1 as longitude.
            first = np.array(latlong_1.tolist())
            second = np.array(latlong_2.tolist())
            return _haversine_calculate(
                first[:, 0], first[:, 1], second[:, 0], second[:, 1], self.unit
            )

        return haversine

    def generate_name(self, base_feature_names):
        """Build the feature name, noting the unit only when it is not miles."""
        arguments = ", ".join(base_feature_names)
        if self.unit != "miles":
            arguments += ", unit={}".format(self.unit)
        return "{}({})".format(self.name.upper(), arguments)
class TestTime(TransformPrimitive):
    # Primitive declaration only: one Datetime input column, numeric-tagged
    # return schema, declared as producing six output features.
    name = "test_time"
    input_types = [
        ColumnSchema(logical_type=Datetime),
    ]
    return_type = ColumnSchema(
        semantic_tags={"numeric"},
    )
    number_output_features = 6
class MultiCumulative(TransformPrimitive):
    # Primitive declaration only: one numeric-tagged input column, numeric-tagged
    # return schema, declared as producing three output features.
    name = "multi_cum_sum"
    input_types = [
        ColumnSchema(semantic_tags={"numeric"}),
    ]
    return_type = ColumnSchema(
        semantic_tags={"numeric"},
    )
    number_output_features = 3
class ThreeMostCommonCat(AggregationPrimitive):
    # Primitive declaration only: one category-tagged input column, category-tagged
    # return schema, declared as producing three output features.
    name = "n_most_common_categorical"
    input_types = [
        ColumnSchema(semantic_tags={"category"}),
    ]
    return_type = ColumnSchema(
        semantic_tags={"category"},
    )
    number_output_features = 3
class NewMax(AggregationPrimitive):
    # Primitive declaration only: a single numeric-tagged input column and a
    # numeric-tagged return column. No calculation is defined in this block.
    name = "new_max"
    input_types = [
        ColumnSchema(semantic_tags={"numeric"}),
    ]
    return_type = ColumnSchema(
        semantic_tags={"numeric"},
    )
def test_column_schema_standard_tags():
    """An Integer schema with standard tags enabled carries the numeric tag."""
    tags = ColumnSchema(logical_type=Integer,
                        use_standard_tags=True).semantic_tags

    assert tags == {'numeric'}
def build_features(self, return_types=None, verbose=False):
    """Automatically builds feature definitions for target
        dataframe using Deep Feature Synthesis algorithm

    Args:
        return_types (list[woodwork.ColumnSchema] or str, optional):
            List of ColumnSchemas defining the types of
            columns to return. If None, defaults to returning all
            numeric, categorical and boolean types. If given as
            the string 'all', use all available return types.

        verbose (bool, optional): If True, print progress.

    Returns:
        list[BaseFeature]: Returns a list of
            features for target dataframe, sorted by feature depth
            (shallow first).

    Raises:
        AssertionError: If ``return_types`` is neither None, the string
            'all', nor a list.
    """
    all_features = {}

    self.where_clauses = defaultdict(set)

    if return_types is None:
        return_types = [
            ColumnSchema(semantic_tags=["numeric"]),
            ColumnSchema(semantic_tags=["category"]),
            ColumnSchema(logical_type=Boolean),
            ColumnSchema(logical_type=BooleanNullable),
        ]
    elif return_types != "all":
        msg = "return_types must be a list, or 'all'"
        assert isinstance(return_types, list), msg

    target_name = self.target_dataframe_name
    self._run_dfs(
        self.es[target_name],
        RelationshipPath([]),
        all_features,
        max_depth=self.max_depth,
    )

    new_features = list(all_features[target_name].values())

    # filter out features with undesired return types
    if return_types != "all":
        new_features = [
            f for f in new_features
            if any(
                is_valid_input(f.column_schema, schema)
                for schema in return_types)
        ]

    # remove identity features of the ID field of the target dataframe;
    # the index name does not depend on the feature, so look it up once
    target_index = self.es[target_name].ww.index
    new_features = [
        f for f in new_features
        if not (isinstance(f, IdentityFeature)
                and f.dataframe_name == target_name
                and f.column_name == target_index)
    ]

    new_features.sort(key=lambda f: f.get_depth())

    new_features = self._filter_features(new_features)

    # A non-positive max_features leaves the list untruncated.
    if self.max_features > 0:
        new_features = new_features[:self.max_features]

    if verbose:
        print("Built {} features".format(len(new_features)))
    return new_features
class CustomAgg(AggregationPrimitive):
    # Primitive declaration only: one category-tagged input column.
    # NOTE(review): this sets ``output_type`` whereas the other primitives in
    # this file set ``return_type`` - confirm the attribute name is intended.
    name = "custom_aggregation"
    input_types = [
        ColumnSchema(semantic_tags={"category"}),
    ]
    output_type = ColumnSchema(
        semantic_tags={"category"},
    )
def _build_transform_features(self, all_features, dataframe, max_depth=0, require_direct_input=False): """Creates trans_features for all the columns in a dataframe Args: all_features (dict[dataframe name: dict->[str->:class:`BaseFeature`]]): Dict containing a dict for each dataframe. Each nested dict has features as values with their ids as keys dataframe (DataFrame): DataFrame to calculate features for. """ new_max_depth = None if max_depth is not None: new_max_depth = max_depth - 1 # Keep track of features to add until the end to avoid applying # transform primitives to features that were also built by transform primitives features_to_add = [] for trans_prim in self.trans_primitives: current_options = self.primitive_options.get( trans_prim, self.primitive_options.get(trans_prim.name)) if ignore_dataframe_for_primitive(current_options, dataframe): continue input_types = trans_prim.input_types matching_inputs = self._get_matching_inputs( all_features, dataframe, new_max_depth, input_types, trans_prim, current_options, require_direct_input=require_direct_input, ) for matching_input in matching_inputs: if all(bf.number_output_features == 1 for bf in matching_input) and check_transform_stacking( matching_input): new_f = TransformFeature(matching_input, primitive=trans_prim) features_to_add.append(new_f) for groupby_prim in self.groupby_trans_primitives: current_options = self.primitive_options.get( groupby_prim, self.primitive_options.get(groupby_prim.name)) if ignore_dataframe_for_primitive(current_options, dataframe, groupby=True): continue input_types = groupby_prim.input_types[:] matching_inputs = self._get_matching_inputs( all_features, dataframe, new_max_depth, input_types, groupby_prim, current_options, ) # get columns to use as groupbys, use IDs as default unless other groupbys specified if any([ "include_groupby_columns" in option and dataframe.ww.name in option["include_groupby_columns"] for option in current_options ]): column_schemas = "all" else: 
column_schemas = [ColumnSchema(semantic_tags=["foreign_key"])] groupby_matches = self._features_by_type( all_features=all_features, dataframe=dataframe, max_depth=new_max_depth, column_schemas=column_schemas, ) groupby_matches = filter_groupby_matches_by_options( groupby_matches, current_options) # If require_direct_input, require a DirectFeature in input or as a # groupby, and don't create features of inputs/groupbys which are # all direct features with the same relationship path for matching_input in matching_inputs: if all(bf.number_output_features == 1 for bf in matching_input) and check_transform_stacking( matching_input): for groupby in groupby_matches: if require_direct_input and ( _all_direct_and_same_path(matching_input + (groupby, )) or not any([ isinstance(feature, DirectFeature) for feature in (matching_input + (groupby, )) ])): continue new_f = GroupByTransformFeature( list(matching_input), groupby=groupby[0], primitive=groupby_prim, ) features_to_add.append(new_f) for new_f in features_to_add: self._handle_new_feature(all_features=all_features, new_feature=new_f)
class AboveTen(TransformPrimitive):
    """Transform primitive declaration taking and producing numeric columns.

    Only declarative attributes are set; this class body defines no
    ``get_function`` of its own.
    """

    name = "above_ten"

    # A single input column carrying the "numeric" semantic tag.
    input_types = [
        ColumnSchema(semantic_tags={"numeric"}),
    ]

    # The output column is tagged "numeric" as well.
    return_type = ColumnSchema(
        semantic_tags={"numeric"},
    )
def _match_contains_numeric_foreign_key(match):
    """Report whether a match includes a numeric foreign key feature.

    Args:
        match (iterable): Features exposing a ``column_schema`` attribute.

    Returns:
        bool: True if ``is_valid_input`` accepts the column schema of at
            least one feature against a schema tagged with both
            "foreign_key" and "numeric"; False otherwise, including for an
            empty match.
    """
    numeric_fk_schema = ColumnSchema(semantic_tags={"foreign_key", "numeric"})
    for feature in match:
        if is_valid_input(feature.column_schema, numeric_fk_schema):
            return True
    return False
class CustomMean(AggregationPrimitive):
    """Aggregation primitive declaration taking and producing numeric columns.

    Only declarative attributes are set; this class body defines no
    ``get_function`` of its own.
    """

    name = "custom_mean"

    # A single input column carrying the "numeric" semantic tag.
    input_types = [
        ColumnSchema(semantic_tags={"numeric"}),
    ]

    # The aggregated output is tagged "numeric" as well.
    return_type = ColumnSchema(
        semantic_tags={"numeric"},
    )
# ColumnSchema describes the input and output column types of the primitive.
from woodwork.column_schema import ColumnSchema

from featuretools.primitives.base import make_agg_primitive

# Aggregation primitive named "CustomSum": the wrapped callable applies the
# builtin ``sum`` to the values it receives; both the single input and the
# output are declared with the "numeric" semantic tag.
# NOTE(review): the lambda is kept instead of passing ``sum`` directly -
# presumably make_agg_primitive may inspect the callable; confirm before
# simplifying.
CustomSum = make_agg_primitive(
    lambda x: sum(x),
    name="CustomSum",
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}))
class CustomMultiOutput(TransformPrimitive):
    """Transform primitive declaration with four category outputs.

    Only declarative attributes are set; this class body defines no
    ``get_function`` of its own.
    """

    name = "custom_multioutput"

    # A single input column carrying the "category" semantic tag.
    input_types = [
        ColumnSchema(semantic_tags={'category'}),
    ]

    # Output columns are tagged "category" as well.
    return_type = ColumnSchema(
        semantic_tags={'category'},
    )

    # The primitive declares four output features per input.
    number_output_features = 4
class CountString(TransformPrimitive):
    """Determines how many times a given string shows up in a text field.

    Entries that are not strings produce NaN.

    Args:
        string (str): The string to determine the count of. Defaults to
            the word "the".
        ignore_case (bool): Determines if case of the string should be
            considered or not. Defaults to True.
        ignore_non_alphanumeric (bool): Determines if non-alphanumeric
            characters should be used in the search. Defaults to False.
        is_regex (bool): Defines if the string argument is a regex or not.
            Defaults to False.
        match_whole_words_only (bool): Determines if whole words should be
            matched or not. For example searching for word `the` against
            `then, the, there` should only return `the` if this argument
            was True. Defaults to False.

    Examples:
        >>> count_string = CountString(string="the")
        >>> count_string(["The problem was difficult.",
        ...               "He was there.",
        ...               "The girl went to the store."]).tolist()
        [1, 1, 2]

        >>> # Match case of string
        >>> count_string_ignore_case = CountString(string="the", ignore_case=False)
        >>> count_string_ignore_case(["The problem was difficult.",
        ...                           "He was there.",
        ...                           "The girl went to the store."]).tolist()
        [0, 1, 1]

        >>> # Ignore non-alphanumeric characters in the search
        >>> count_string_ignore_non_alphanumeric = CountString(string="the",
        ...                                                    ignore_non_alphanumeric=True)
        >>> count_string_ignore_non_alphanumeric(["Th*/e problem was difficult.",
        ...                                       "He was there.",
        ...                                       "The girl went to the store."]).tolist()
        [1, 1, 2]

        >>> # Specify the string as a regex
        >>> count_string_is_regex = CountString(string="t.e", is_regex=True)
        >>> count_string_is_regex(["The problem was difficult.",
        ...                        "He was there.",
        ...                        "The girl went to the store."]).tolist()
        [1, 1, 2]

        >>> # Match whole words only
        >>> count_string_match_whole_words_only = CountString(string="the",
        ...                                                   match_whole_words_only=True)
        >>> count_string_match_whole_words_only(["The problem was difficult.",
        ...                                      "He was there.",
        ...                                      "The girl went to the store."]).tolist()
        [1, 0, 2]
    """
    name = "count_string"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Integer, semantic_tags={'numeric'})

    def __init__(self, string='the', ignore_case=True,
                 ignore_non_alphanumeric=False, is_regex=False,
                 match_whole_words_only=False):
        self.string = string
        self.ignore_case = ignore_case
        self.ignore_non_alphanumeric = ignore_non_alphanumeric
        self.match_whole_words_only = match_whole_words_only
        self.is_regex = is_regex

        # we don't want to strip non alphanumeric characters from the pattern
        # ie h.ll. should match "hello" so we can't strip the dots to make hll
        if not is_regex:
            # process_text already lowercases the literal when ignore_case is
            # set, so no further case handling is needed for this branch.
            self.pattern = re.escape(self.process_text(string))
        else:
            # A regex is never lowercased: doing so would turn escapes such
            # as \W, \S, \D and \B into their opposites. Case-insensitivity
            # for regexes is applied with re.IGNORECASE at match time.
            self.pattern = string

        # \b\b.*\b\b is the same as \b.*\b so we don't have to check if
        # the pattern is given to us as regex and if it already has leading
        # and trailing \b's. The non-capturing group keeps the word
        # boundaries applied to the whole pattern, including every branch of
        # an alternation such as "a|b".
        if match_whole_words_only:
            self.pattern = "\\b(?:" + self.pattern + ")\\b"

    def process_text(self, text):
        """Normalize ``text`` according to the primitive's settings.

        Args:
            text (str): Text to normalize.

        Returns:
            str: ``text`` with characters other than ASCII letters, digits
                and spaces removed when ``ignore_non_alphanumeric`` is set,
                and lowercased when ``ignore_case`` is set.
        """
        if self.ignore_non_alphanumeric:
            text = re.sub('[^0-9a-zA-Z ]+', '', text)
        if self.ignore_case:
            text = text.lower()
        return text

    def get_function(self):
        """Return a vectorized callable counting pattern matches per entry."""
        # Only regex patterns need the flag; literal patterns were already
        # normalized together with the text.
        flags = re.IGNORECASE if (self.is_regex and self.ignore_case) else 0

        def count_string(words):
            # isinstance (rather than an exact type check) so that str
            # subclasses such as numpy.str_ are counted instead of being
            # reported as NaN.
            if not isinstance(words, str):
                return np.nan
            words = self.process_text(words)
            return len(re.findall(self.pattern, words, flags))

        return np.vectorize(count_string)