def test_schema_equality():
    """ColumnSchema equality reflects every attribute of the schema."""
    base = ColumnSchema(logical_type=Categorical)
    # Each variant differs from `base` by exactly one attribute, so each
    # must compare unequal on its own.
    variants = [
        ColumnSchema(logical_type=Categorical, description='description'),
        ColumnSchema(logical_type=Categorical,
                     metadata={'interesting_values': ['a', 'b']}),
        ColumnSchema(logical_type=Categorical, use_standard_tags=True),
        ColumnSchema(logical_type=Categorical, semantic_tags={'new_tag'}),
    ]
    for variant in variants:
        assert base != variant

    # Check columns with same logical types but different parameters
    ordinal_abc = ColumnSchema(logical_type=Ordinal(order=['a', 'b', 'c']))
    ordinal_bac = ColumnSchema(logical_type=Ordinal(order=['b', 'a', 'c']))
    assert base != ordinal_abc
    assert ordinal_abc != ordinal_bac
    assert ordinal_abc == ordinal_abc

    with_format = ColumnSchema(logical_type=Datetime(datetime_format='%Y-%m%d'))
    none_format = ColumnSchema(logical_type=Datetime(datetime_format=None))
    default_instance = ColumnSchema(logical_type=Datetime())
    uninstantiated = ColumnSchema(logical_type=Datetime)
    assert uninstantiated != default_instance
    assert default_instance != with_format
    assert default_instance == none_format
def test_ordinal_transform_dask(ordinal_transform_series_dask) -> None:
    """Transforming a Dask series with Ordinal yields a categorical dtype
    whose categories match the provided order."""
    order = [2, 1, 3]
    typ = Ordinal(order=order)
    ser_ = typ.transform(ordinal_transform_series_dask).compute()
    assert ser_.dtype == "category"
    # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
    # pd.Index with an explicit int64 dtype is the supported equivalent.
    pd.testing.assert_index_equal(ser_.cat.categories,
                                  pd.Index(order, dtype="int64"))
def test_categorical_dtype_serialization(serialize_df, tmpdir):
    """Round-trip categorical and ordinal columns through each on-disk format."""
    ltypes = {
        "cat_int": Categorical,
        "ord_int": Ordinal(order=[1, 2]),
        "cat_float": Categorical,
        "ord_float": Ordinal(order=[1.0, 2.0]),
        "cat_bool": Categorical,
        "ord_bool": Ordinal(order=[True, False]),
    }
    # Only plain pandas frames support every serialization format here.
    if isinstance(serialize_df, pd.DataFrame):
        formats = ["csv", "pickle", "parquet"]
    else:
        formats = ["csv"]
    for fmt in formats:
        frame = serialize_df.copy()
        frame.ww.init(index="id", logical_types=ltypes)
        frame.ww.to_disk(str(tmpdir), format=fmt)
        roundtrip = read_woodwork_table(str(tmpdir), filename=f"data.{fmt}")
        pd.testing.assert_frame_equal(
            to_pandas(roundtrip, index=roundtrip.ww.index, sort_index=True),
            to_pandas(frame, index=frame.ww.index, sort_index=True),
        )
        assert roundtrip.ww.schema == frame.ww.schema
        # Clear the directory so the next format starts from scratch.
        shutil.rmtree(str(tmpdir))
def test_ordinal_init_with_order():
    """Ordinal accepts its order as either a list or a tuple."""
    for ranking in (['bronze', 'silver', 'gold'], ('bronze', 'silver', 'gold')):
        assert Ordinal(order=ranking).order == ranking
def test_ordinal_order_errors():
    """Invalid order arguments raise informative type/value errors."""
    # A bare string is not an acceptable ordering container.
    with pytest.raises(
            TypeError,
            match='Order values must be specified in a list or tuple'):
        Ordinal(order='not_valid')
    # Repeated values would make the ranking ambiguous.
    with pytest.raises(ValueError,
                       match='Order values cannot contain duplicates'):
        Ordinal(order=['a', 'b', 'b'])
def test_ordinal_init_with_order():
    """Order may be a list or a tuple; str() renders the ordering."""
    ranking = ["bronze", "silver", "gold"]
    from_list = Ordinal(order=ranking)
    assert from_list.order == ranking
    assert str(from_list) == "Ordinal: ['bronze', 'silver', 'gold']"

    ranking = ("bronze", "silver", "gold")
    from_tuple = Ordinal(order=ranking)
    assert from_tuple.order == ranking
    # NOTE(review): the original re-asserts the list-based instance here;
    # possibly str(from_tuple) was intended — confirm before changing.
    assert str(from_list) == "Ordinal: ['bronze', 'silver', 'gold']"
def test_ordinal_order_errors():
    """transform() validates the order argument's container type and uniqueness."""
    series = pd.Series([1, 2, 3]).astype("category")
    with pytest.raises(
            TypeError,
            match="Order values must be specified in a list or tuple"):
        Ordinal(order="not_valid").transform(series)
    with pytest.raises(ValueError,
                       match="Order values cannot contain duplicates"):
        Ordinal(order=["a", "b", "b"]).transform(series)
def test_sets_category_dtype_on_init():
    """Category-like logical types coerce object series to their pandas dtype."""
    column_name = 'test_series'
    # Exercise a clean series plus one with each missing-value sentinel.
    series_list = [pd.Series(['a', 'b', 'c'], name=column_name)] + [
        pd.Series(['a', missing, 'c'], name=column_name)
        for missing in (None, np.nan, pd.NA, pd.NaT)
    ]
    logical_types = [
        Categorical,
        CountryCode,
        Ordinal(order=['a', 'b', 'c']),
        SubRegionCode,
        ZIPCode,
    ]
    for series in series_list:
        series = series.astype('object')
        for logical_type in logical_types:
            dt = DataTable(pd.DataFrame(series),
                           logical_types={column_name: logical_type})
            column = dt.columns[column_name]
            assert column.logical_type == logical_type
            assert column.dtype == logical_type.pandas_dtype
            assert dt.to_dataframe()[column_name].dtype == logical_type.pandas_dtype
def test_to_csv(sample_df, tmpdir):
    """CSV serialization round-trips both the data and the typing schema."""
    sample_df.ww.init(
        name='test_data',
        index='id',
        semantic_tags={'id': 'tag1'},
        logical_types={'age': Ordinal(order=[25, 33, 57])},
        column_descriptions={
            'signup_date': 'original signup date',
            'age': 'age of the user',
        },
        column_metadata={
            'id': {'is_sorted': True},
            'age': {'interesting_values': [33, 57]},
        },
    )
    sample_df.ww.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    roundtrip = deserialize.read_woodwork_table(str(tmpdir))
    pd.testing.assert_frame_equal(
        to_pandas(roundtrip, index=roundtrip.ww.index, sort_index=True),
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True))
    assert roundtrip.ww.schema == sample_df.ww.schema
class Week(TransformPrimitive):
    """Determines the week of the year from a datetime.

    Description:
        Returns the week of the year from a datetime value. The first
        week of the year starts on January 1, and week numbers increment
        each Monday.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 1, 3),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> week = Week()
        >>> week(dates).tolist()
        [1, 25, 48]
    """

    name = "week"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Week numbers span 1-53, hence the ordinal ordering over that range.
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 54))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the week of the year of {}"

    def get_function(self):
        def week(vals):
            # Newer pandas exposes isocalendar(); older versions only
            # provide the (since-deprecated) .week attribute.
            accessor = vals.dt
            if hasattr(accessor, "isocalendar"):
                return accessor.isocalendar().week
            return accessor.week

        return week
class Week(TransformPrimitive):
    """Determines the week of the year from a datetime.

    Description:
        Returns the week of the year from a datetime value. The first
        week of the year starts on January 1, and week numbers increment
        each Monday.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 1, 3),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> week = Week()
        >>> week(dates).tolist()
        [1, 25, 48]
    """

    name = "week"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Week numbers span 1-53, hence the ordinal ordering over that range.
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 54))),
        semantic_tags={'category'},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.KOALAS]
    description_template = "the week of the year of {}"

    def get_function(self):
        def week(vals):
            # Suppress the pandas deprecation notice for Series.dt.week,
            # which this primitive still relies on.
            warnings.filterwarnings(
                "ignore",
                message=("Series.dt.weekofyear and Series.dt.week "
                         "have been deprecated."),
                module="featuretools")
            return vals.dt.week

        return week
class Year(TransformPrimitive):
    """Determines the year value of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2048, 6, 17, 11, 10, 50),
        ...          datetime(1950, 11, 30, 19, 45, 15)]
        >>> year = Year()
        >>> year(dates).tolist()
        [2019, 2048, 1950]
    """

    name = "year"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Ordinal over years 1-2999 so outputs carry a defined ordering.
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 3000))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the year of {}"

    def get_function(self):
        def year(vals):
            # The pandas datetime accessor exposes the year component directly.
            return vals.dt.year

        return year
class DayOfYear(TransformPrimitive):
    """Determines the ordinal day of the year from the given datetime.

    Description:
        For a list of dates, return the ordinal day of the year from the
        given datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 1, 1),
        ...          datetime(2020, 12, 31),
        ...          datetime(2020, 2, 28)]
        >>> dayOfYear = DayOfYear()
        >>> dayOfYear(dates).tolist()
        [1, 366, 59]
    """

    name = "day_of_year"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Day-of-year spans 1-366 (leap years included).
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 367))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the day of year from {}"

    def get_function(self):
        def dayOfYear(vals):
            # Delegate to pandas' datetime accessor.
            return vals.dt.dayofyear

        return dayOfYear
class Second(TransformPrimitive):
    """Determines the seconds value of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 3, 3, 11, 10, 50),
        ...          datetime(2019, 3, 31, 19, 45, 15)]
        >>> second = Second()
        >>> second(dates).tolist()
        [0, 50, 15]
    """

    name = "second"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Seconds range 0-59.
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(60))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the seconds value of {}"

    def get_function(self):
        def second(vals):
            # Delegate to pandas' datetime accessor.
            return vals.dt.second

        return second
class Weekday(TransformPrimitive):
    """Determines the day of the week from a datetime.

    Description:
        Returns the day of the week from a datetime value. Weeks start
        on Monday (day 0) and run through Sunday (day 6).

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> weekday = Weekday()
        >>> weekday(dates).tolist()
        [4, 0, 5]
    """

    name = "weekday"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Weekdays range 0 (Monday) through 6 (Sunday).
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(7))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the day of the week of {}"

    def get_function(self):
        def weekday(vals):
            # Delegate to pandas' datetime accessor.
            return vals.dt.weekday

        return weekday
class Day(TransformPrimitive):
    """Determines the day of the month from a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 3, 3),
        ...          datetime(2019, 3, 31)]
        >>> day = Day()
        >>> day(dates).tolist()
        [1, 3, 31]
    """

    name = "day"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Day of month spans 1-31.
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 32))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the day of the month of {}"

    def get_function(self):
        def day(vals):
            # Delegate to pandas' datetime accessor.
            return vals.dt.day

        return day
class Month(TransformPrimitive):
    """Determines the month value of a datetime.

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019, 3, 1),
        ...          datetime(2019, 6, 17, 11, 10, 50),
        ...          datetime(2019, 11, 30, 19, 45, 15)]
        >>> month = Month()
        >>> month(dates).tolist()
        [3, 6, 11]
    """

    name = "month"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Months span 1-12.
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 13))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the month of {}"

    def get_function(self):
        def month(vals):
            # Delegate to pandas' datetime accessor.
            return vals.dt.month

        return month
def test_to_csv_S3(sample_df, s3_client, s3_bucket, profile_name):
    """Serialize a typed frame to S3 as CSV and read it back unchanged."""
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init(
        name="test_data",
        index="id",
        semantic_tags={"id": "tag1"},
        logical_types={"age": Ordinal(order=[25, 33, 57])},
    )
    sample_df.ww.to_disk(
        TEST_S3_URL,
        format="csv",
        encoding="utf-8",
        engine="python",
        profile_name=profile_name,
    )
    # The bucket contents must be readable without credentials.
    make_public(s3_client, s3_bucket)
    restored = read_woodwork_table(TEST_S3_URL, profile_name=profile_name)
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(restored, index=restored.ww.index, sort_index=True),
    )
    assert sample_df.ww.schema == restored.ww.schema
class Quarter(TransformPrimitive):
    """Determines the quarter a datetime column falls into (1, 2, 3, 4).

    Examples:
        >>> from datetime import datetime
        >>> dates = [datetime(2019,12,1),
        ...          datetime(2019,1,3),
        ...          datetime(2020,2,1)]
        >>> q = Quarter()
        >>> q(dates).tolist()
        [4, 1, 1]
    """

    name = "quarter"
    input_types = [ColumnSchema(logical_type=Datetime)]
    # Quarters span 1-4.
    return_type = ColumnSchema(
        logical_type=Ordinal(order=list(range(1, 5))),
        semantic_tags={"category"},
    )
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the quarter that describes {}"

    def get_function(self):
        def quarter(vals):
            # Delegate to pandas' datetime accessor.
            return vals.dt.quarter

        return quarter
def test_datacolumn_equality(sample_series, sample_datetime_series):
    """DataColumn equality accounts for type, tags, metadata and data."""
    str_col = DataColumn(sample_series, logical_type='Categorical')
    str_col_2 = DataColumn(sample_series, logical_type=Categorical)
    # String and class forms of the same logical type compare equal.
    assert str_col == str_col_2

    # Each of these differs from `str_col` by one constructor argument,
    # so each must compare unequal on its own.
    unequal_columns = [
        DataColumn(sample_series, logical_type=Categorical,
                   semantic_tags={'test'}),
        DataColumn(sample_datetime_series, logical_type=Categorical),
        DataColumn(sample_series, logical_type=NaturalLanguage),
        DataColumn(sample_series, logical_type='Categorical',
                   description='description'),
        DataColumn(sample_series, logical_type='Categorical',
                   metadata={'interesting_values': ['a', 'b']}),
    ]
    for other in unequal_columns:
        assert str_col != other

    # Check columns with same logical types but different parameters
    ordinal_col_1 = DataColumn(sample_series,
                               logical_type=Ordinal(order=['a', 'b', 'c']))
    ordinal_col_2 = DataColumn(sample_series,
                               logical_type=Ordinal(order=['b', 'a', 'c']))
    assert str_col != ordinal_col_1
    assert ordinal_col_1 != ordinal_col_2
    assert ordinal_col_1 == ordinal_col_1

    datetime_col_format = DataColumn(
        sample_datetime_series,
        logical_type=Datetime(datetime_format='%Y-%m%d'))
    datetime_col_param = DataColumn(
        sample_datetime_series,
        logical_type=Datetime(datetime_format=None))
    datetime_col_instantiated = DataColumn(sample_datetime_series,
                                           logical_type=Datetime())
    datetime_col = DataColumn(sample_datetime_series, logical_type=Datetime)
    assert datetime_col != datetime_col_instantiated
    assert datetime_col_instantiated != datetime_col_format
    assert datetime_col_instantiated == datetime_col_param

    # Check different underlying series
    str_col = DataColumn(sample_series, logical_type='NaturalLanguage')
    changed_series = sample_series.copy().replace(to_replace='a', value='test')
    null_col = DataColumn(changed_series, logical_type='NaturalLanguage')
    # We only check underlying data for equality with pandas dataframes
    if isinstance(str_col.to_series(), pd.Series):
        assert str_col != null_col
    else:
        assert str_col == null_col
def test_ordinal_transform(sample_series):
    """transform() rejects series values missing from the declared order."""
    backend = str(type(sample_series))
    if "dask" in backend or "spark" in backend:
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not supported"
        )
    incomplete = Ordinal(order=["a", "b"])
    error_msg = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']")
    with pytest.raises(ValueError, match=error_msg):
        incomplete.transform(sample_series)
def test_adds_category_standard_tag():
    """Every category-like logical type contributes the 'category' tag."""
    series = pd.Series([1, 2, 3])
    for logical_type in (Categorical, CountryCode, Ordinal(order=(1, 2, 3)),
                         SubRegionCode, ZIPCode):
        data_col = DataColumn(series,
                              logical_type=logical_type,
                              semantic_tags='custom_tag')
        # The user-supplied tag is preserved alongside the standard one.
        assert data_col.semantic_tags == {'custom_tag', 'category'}
def test_ordinal_validation_methods_called_init(mock_validate, sample_series):
    """validate=False skips ordinal validation; validate=True triggers it."""
    assert not mock_validate.called

    not_validated = sample_series.copy()
    not_validated.ww.init(logical_type=Ordinal(order=["a", "b", "c"]),
                          validate=False)
    assert not mock_validate.called

    validated = sample_series.copy()
    validated.ww.init(logical_type=Ordinal(order=["a", "b", "c"]),
                      validate=True)
    assert mock_validate.called

    # Skipping validation must not change the typing info or the data.
    assert validated.ww == not_validated.ww
    pd.testing.assert_series_equal(to_pandas(validated),
                                   to_pandas(not_validated))
def test_adds_category_standard_tag():
    """Category-like logical types add 'category' alongside custom tags."""
    for logical_type in (Categorical, CountryCode, Ordinal(order=(1, 2, 3)),
                         PostalCode, SubRegionCode):
        # A fresh series each iteration so ww state never carries over.
        series = pd.Series([1, 2, 3], dtype='category')
        series.ww.init(logical_type=logical_type, semantic_tags='custom_tag')
        assert series.ww.semantic_tags == {'custom_tag', 'category'}
def test_ordinal_with_incomplete_ranking(sample_series):
    """Initializing with an order missing observed values should raise."""
    is_koalas = ks and isinstance(sample_series, ks.Series)
    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_koalas or is_dask:
        pytest.xfail('Fails with Dask and Koalas - ordinal data validation not supported')
    error_msg = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']")
    with pytest.raises(ValueError, match=error_msg):
        sample_series.ww.init(logical_type=Ordinal(order=['a', 'b']))
def test_ordinal_with_incomplete_ranking(sample_series):
    """Initializing with an order missing observed values should raise."""
    if _is_spark_series(sample_series) or _is_dask_series(sample_series):
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not supported"
        )
    incomplete = Ordinal(order=["a", "b"])
    error_msg = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']")
    with pytest.raises(ValueError, match=error_msg):
        sample_series.ww.init(logical_type=incomplete)
def test_is_col_boolean():
    """_is_col_boolean is true only for Boolean logical types (class or instance)."""
    assert _is_col_boolean(ColumnSchema(logical_type=Boolean))
    assert _is_col_boolean(ColumnSchema(logical_type=Boolean()))
    # Non-boolean logical types must not match.
    assert not _is_col_boolean(ColumnSchema(logical_type=Ordinal(order=['a', 'b'])))
    assert not _is_col_boolean(ColumnSchema(logical_type=NaturalLanguage))
def test_all_ww_logical_types():
    """An EntitySet using every woodwork logical type serializes round-trip."""
    logical_types = list_logical_types()['type_string'].to_list()
    dataframe = pd.DataFrame(columns=logical_types)
    ltype_dict = {name: name for name in logical_types}
    # Ordinal requires an explicit (here empty) ordering to instantiate.
    ltype_dict['ordinal'] = Ordinal(order=[])
    es = EntitySet()
    es.add_dataframe(dataframe=dataframe,
                     dataframe_name='all_types',
                     index='integer',
                     logical_types=ltype_dict)
    description = serialize.entityset_to_description(es)
    restored = deserialize.description_to_entityset(description)
    assert es.__eq__(restored, deep=True)
def test_ordinal_with_order(sample_series):
    """The declared order survives both init and set_logical_type."""
    is_koalas = ks and isinstance(sample_series, ks.Series)
    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_koalas or is_dask:
        pytest.xfail('Fails with Dask and Koalas - ordinal data validation not compatible')
    ordered = Ordinal(order=['a', 'b', 'c'])
    dc = DataColumn(sample_series, logical_type=ordered)
    assert isinstance(dc.logical_type, Ordinal)
    assert dc.logical_type.order == ['a', 'b', 'c']

    # Converting an existing column keeps the same ordering information.
    converted = DataColumn(sample_series,
                           logical_type="NaturalLanguage").set_logical_type(ordered)
    assert isinstance(converted.logical_type, Ordinal)
    assert converted.logical_type.order == ['a', 'b', 'c']
def test_schema_equality():
    """ColumnSchema equality reflects every attribute, including origin."""
    base = ColumnSchema(logical_type=Categorical)
    # Each variant differs from `base` by one attribute and must be unequal.
    variants = [
        ColumnSchema(logical_type=Categorical, description="description"),
        ColumnSchema(logical_type=Categorical, origin="base"),
        ColumnSchema(logical_type=Categorical,
                     metadata={"interesting_values": ["a", "b"]}),
        ColumnSchema(logical_type=Categorical, use_standard_tags=True),
        ColumnSchema(logical_type=Categorical, semantic_tags={"new_tag"}),
    ]
    for variant in variants:
        assert base != variant

    # Check columns with same logical types but different parameters
    ordinal_abc = ColumnSchema(logical_type=Ordinal(order=["a", "b", "c"]))
    ordinal_bac = ColumnSchema(logical_type=Ordinal(order=["b", "a", "c"]))
    assert base != ordinal_abc
    assert ordinal_abc != ordinal_bac
    assert ordinal_abc == ordinal_abc

    with_format = ColumnSchema(
        logical_type=Datetime(datetime_format="%Y-%m%d"))
    none_format = ColumnSchema(
        logical_type=Datetime(datetime_format=None))
    default_instance = ColumnSchema(logical_type=Datetime())
    assert default_instance != with_format
    assert default_instance == none_format