def test_to_disk_with_whitespace(whitespace_df, tmpdir, format):
    df = whitespace_df.copy()
    df.ww.init(index="id", logical_types={"comments": "NaturalLanguage"})
    if format == "pickle" and not isinstance(df, pd.DataFrame):
        msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        with pytest.raises(ValueError, match=msg):
            df.ww.to_disk(str(tmpdir), format="pickle")
    else:
        df.ww.to_disk(str(tmpdir), format=format)
        if format == "parquet":
            filename = "data.parquet"
            format = None
            if _is_dask_dataframe(whitespace_df) or _is_spark_dataframe(whitespace_df):
                filename = None
                format = "parquet"
            deserialized_df = read_woodwork_table(
                path=str(tmpdir),
                filename=filename,
                format=format,
            )
        else:
            deserialized_df = read_woodwork_table(str(tmpdir))

        assert deserialized_df.ww.schema == df.ww.schema
        pd.testing.assert_frame_equal(
            to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
            to_pandas(df, index=df.ww.index, sort_index=True),
        )
def test_to_csv_use_standard_tags(sample_df, tmpdir):
    no_standard_tags_df = sample_df.copy()
    no_standard_tags_df.ww.init(use_standard_tags=False)
    no_standard_tags_df.ww.to_disk(str(tmpdir), format="csv", encoding="utf-8", engine="python")
    deserialized_no_tags_df = read_woodwork_table(str(tmpdir))
    shutil.rmtree(str(tmpdir))

    standard_tags_df = sample_df.copy()
    standard_tags_df.ww.init(use_standard_tags=True)
    standard_tags_df.ww.to_disk(str(tmpdir), format="csv", encoding="utf-8", engine="python")
    deserialized_tags_df = read_woodwork_table(str(tmpdir))
    shutil.rmtree(str(tmpdir))

    assert no_standard_tags_df.ww.schema != standard_tags_df.ww.schema
    assert deserialized_no_tags_df.ww.schema == no_standard_tags_df.ww.schema
    assert deserialized_tags_df.ww.schema == standard_tags_df.ww.schema
def test_to_disk_parquet_no_file_extension(sample_df, tmpdir):
    if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
        pytest.skip(
            "Specifying filename for writing Dask or Spark DataFrames to parquet is not supported."
        )
    sample_df.ww.init(index="id")
    sample_df.ww.to_disk(str(tmpdir), filename="parquet_data", format="parquet")

    error_msg = "Could not determine format. Please specify filename and/or format."
    # Without specifying format, Woodwork doesn't know what type of file this is
    with pytest.raises(ValueError, match=error_msg):
        deserialized_df = read_woodwork_table(
            str(tmpdir),
            filename="parquet_data",
        )

    deserialized_df = read_woodwork_table(
        str(tmpdir),
        filename="parquet_data",
        format="parquet",
    )
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
    )
def test_categorical_dtype_serialization(serialize_df, tmpdir):
    ltypes = {
        "cat_int": Categorical,
        "ord_int": Ordinal(order=[1, 2]),
        "cat_float": Categorical,
        "ord_float": Ordinal(order=[1.0, 2.0]),
        "cat_bool": Categorical,
        "ord_bool": Ordinal(order=[True, False]),
    }

    if isinstance(serialize_df, pd.DataFrame):
        formats = ["csv", "pickle", "parquet"]
    else:
        formats = ["csv"]

    for format in formats:
        df = serialize_df.copy()
        df.ww.init(index="id", logical_types=ltypes)
        df.ww.to_disk(str(tmpdir), format=format)
        deserialized_df = read_woodwork_table(str(tmpdir), filename=f"data.{format}")

        pd.testing.assert_frame_equal(
            to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
            to_pandas(df, index=df.ww.index, sort_index=True),
        )
        assert deserialized_df.ww.schema == df.ww.schema
        shutil.rmtree(str(tmpdir))
def test_to_csv_S3(sample_df, s3_client, s3_bucket, profile_name):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init(
        name="test_data",
        index="id",
        semantic_tags={"id": "tag1"},
        logical_types={"age": Ordinal(order=[25, 33, 57])},
    )
    sample_df.ww.to_disk(
        TEST_S3_URL,
        format="csv",
        encoding="utf-8",
        engine="python",
        profile_name=profile_name,
    )
    make_public(s3_client, s3_bucket)
    deserialized_df = read_woodwork_table(TEST_S3_URL, profile_name=profile_name)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
    )
    assert sample_df.ww.schema == deserialized_df.ww.schema
def test_to_csv(sample_df, tmpdir):
    sample_df.ww.init(
        name='test_data',
        index='id',
        semantic_tags={'id': 'tag1'},
        logical_types={'age': Ordinal(order=[25, 33, 57])},
        column_descriptions={
            'signup_date': 'original signup date',
            'age': 'age of the user'
        },
        column_metadata={
            'id': {'is_sorted': True},
            'age': {'interesting_values': [33, 57]}
        })
    sample_df.ww.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True))
    assert deserialized_df.ww.schema == sample_df.ww.schema
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path, validate=False, **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
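# A minimal usage sketch for description_to_entityset (not part of the library).
# It assumes an existing EntitySet `es` and that entityset_to_description lives
# in featuretools.entityset.serialize, as the docstring above suggests; the
# exact import path may differ between featuretools versions.
def example_roundtrip_entityset_description(es):
    from featuretools.entityset.serialize import entityset_to_description

    # Serialize to an in-memory description (no 'path' key), then rebuild.
    # With no path, description_to_entityset constructs empty dataframes
    # from the stored typing information rather than reading data from disk.
    description = entityset_to_description(es)
    rebuilt = description_to_entityset(description)
    assert rebuilt.id == es.id
    return rebuilt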
def test_deserialize_s3_csv(sample_df_pandas):
    sample_df_pandas.ww.init(index='id')
    deserialized_df = deserialize.read_woodwork_table(S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
def test_deserialize_url_csv(sample_df_pandas, profile_name):
    sample_df_pandas.ww.init(index="id")
    deserialized_df = read_woodwork_table(URL, profile_name=profile_name)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index),
    )
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
def test_deserialize_url_csv_anon(sample_df_pandas):
    sample_df_pandas.ww.init(index='id')
    deserialized_df = deserialize.read_woodwork_table(URL, profile_name=False)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
def test_to_parquet(sample_df, tmpdir):
    sample_df.ww.init(index='id')
    sample_df.ww.to_parquet(str(tmpdir))
    deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True))
    assert sample_df.ww.schema == deserialized_df.ww.schema
def test_deserialize_handles_indexes(sample_df, tmpdir):
    sample_df.ww.init(
        name="test_data",
        index="id",
        time_index="signup_date",
    )
    sample_df.ww.to_disk(str(tmpdir), format="csv")
    deserialized_df = read_woodwork_table(str(tmpdir))

    assert deserialized_df.ww.index == "id"
    assert deserialized_df.ww.time_index == "signup_date"
def test_to_csv_use_standard_tags(sample_df, tmpdir):
    no_standard_tags_df = sample_df.copy()
    no_standard_tags_df.ww.init(use_standard_tags=False)
    no_standard_tags_df.ww.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    deserialized_no_tags_df = deserialize.read_woodwork_table(str(tmpdir))

    standard_tags_df = sample_df.copy()
    standard_tags_df.ww.init(use_standard_tags=True)
    standard_tags_df.ww.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    deserialized_tags_df = deserialize.read_woodwork_table(str(tmpdir))

    assert no_standard_tags_df.ww.schema != standard_tags_df.ww.schema
    assert deserialized_no_tags_df.ww.schema == no_standard_tags_df.ww.schema
    assert deserialized_tags_df.ww.schema == standard_tags_df.ww.schema
def test_serialize_s3_pickle_anon(sample_df_pandas, s3_client, s3_bucket):
    sample_df_pandas.ww.init()
    sample_df_pandas.ww.to_pickle(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL, profile_name=False)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
def test_to_disk(sample_df, tmpdir, file_format):
    if file_format in ("arrow", "feather") and not isinstance(sample_df, pd.DataFrame):
        pytest.xfail("Arrow IPC format (Feather) not supported on Dask or Spark")
    sample_df.ww.init(index="id")

    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(sample_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(sample_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            sample_df.ww.to_disk(str(tmpdir), format=file_format)
    else:
        sample_df.ww.to_disk(str(tmpdir), format=file_format)
        if file_format == "parquet":
            filename = "data.parquet"
            format = None
            if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
                filename = None
                format = "parquet"
            deserialized_df = read_woodwork_table(
                path=str(tmpdir),
                filename=filename,
                format=format,
            )
        else:
            deserialized_df = read_woodwork_table(str(tmpdir))

        pd.testing.assert_frame_equal(
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
            to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
        )
        assert sample_df.ww.schema == deserialized_df.ww.schema
def test_to_parquet_with_latlong(latlong_df, tmpdir):
    latlong_df.ww.init(logical_types={col: 'LatLong' for col in latlong_df.columns})
    latlong_df.ww.to_parquet(str(tmpdir))
    deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(latlong_df, index=latlong_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True))
    assert latlong_df.ww.schema == deserialized_df.ww.schema
def test_to_disk_custom_data_filename(sample_df, tmpdir, file_format):
    if file_format in ("arrow", "feather") and not isinstance(sample_df, pd.DataFrame):
        pytest.xfail("Arrow IPC format (Feather) not supported on Dask or Spark")
    sample_df.ww.init(index="id")

    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(sample_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(sample_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "parquet" and _is_dask_dataframe(sample_df):
        error_msg = "Writing a Dask dataframe to parquet with a filename specified is not supported"
        error_type = ValueError
    elif file_format == "csv" and _is_spark_dataframe(sample_df):
        error_msg = "Writing a Spark dataframe to csv with a filename specified is not supported"
        error_type = ValueError
    elif file_format == "parquet" and _is_spark_dataframe(sample_df):
        error_msg = "Writing a Spark dataframe to parquet with a filename specified is not supported"
        error_type = ValueError

    data_filename = f"custom_data.{file_format}"
    filename_to_check = data_filename
    if _is_dask_dataframe(sample_df):
        data_filename = f"custom_data-*.{file_format}"
        filename_to_check = f"custom_data-0.{file_format}"

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            sample_df.ww.to_disk(path=str(tmpdir), format=file_format, filename=data_filename)
    else:
        sample_df.ww.to_disk(path=str(tmpdir), format=file_format, filename=data_filename)

        assert os.path.isfile(os.path.join(tmpdir, "data", filename_to_check))

        deserialized_df = read_woodwork_table(
            path=str(tmpdir),
            filename=data_filename,
        )
        pd.testing.assert_frame_equal(
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
            to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
        )
        assert sample_df.ww.schema == deserialized_df.ww.schema
def test_serialize_s3_parquet(sample_df, s3_client, s3_bucket):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init()
    sample_df.ww.to_parquet(TEST_S3_URL)
    make_public(s3_client, s3_bucket)
    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True))
    assert sample_df.ww.schema == deserialized_df.ww.schema
def test_s3_test_profile(sample_df, s3_client, s3_bucket, setup_test_profile):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init()
    sample_df.ww.to_csv(TEST_S3_URL, encoding='utf-8', engine='python', profile_name='test')
    make_public(s3_client, s3_bucket)
    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL, profile_name='test')

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df.ww.schema == deserialized_df.ww.schema
def test_to_pickle(sample_df, tmpdir):
    sample_df.ww.init()
    if not isinstance(sample_df, pd.DataFrame):
        msg = 'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
        with pytest.raises(ValueError, match=msg):
            sample_df.ww.to_pickle(str(tmpdir))
    else:
        sample_df.ww.to_pickle(str(tmpdir))
        deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

        pd.testing.assert_frame_equal(
            to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True))
        assert deserialized_df.ww.schema == sample_df.ww.schema
def test_to_csv_S3(sample_df, s3_client, s3_bucket):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init(
        name='test_data',
        index='id',
        semantic_tags={'id': 'tag1'},
        logical_types={'age': Ordinal(order=[25, 33, 57])})
    sample_df.ww.to_csv(TEST_S3_URL, encoding='utf-8', engine='python')
    make_public(s3_client, s3_bucket)
    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True))
    assert sample_df.ww.schema == deserialized_df.ww.schema
def test_to_pickle_with_latlong(latlong_df, tmpdir):
    latlong_df.ww.init(logical_types={col: 'LatLong' for col in latlong_df.columns})
    if not isinstance(latlong_df, pd.DataFrame):
        msg = 'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
        with pytest.raises(ValueError, match=msg):
            latlong_df.ww.to_pickle(str(tmpdir))
    else:
        latlong_df.ww.to_pickle(str(tmpdir))
        deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

        pd.testing.assert_frame_equal(
            to_pandas(latlong_df, index=latlong_df.ww.index, sort_index=True),
            to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True))
        assert latlong_df.ww.schema == deserialized_df.ww.schema
def test_s3_test_profile(sample_df, s3_client, s3_bucket, setup_test_profile):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init()
    sample_df.ww.to_disk(
        TEST_S3_URL,
        format="csv",
        encoding="utf-8",
        engine="python",
        profile_name="test",
    )
    make_public(s3_client, s3_bucket)
    deserialized_df = read_woodwork_table(TEST_S3_URL, profile_name="test")

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index),
    )
    assert sample_df.ww.schema == deserialized_df.ww.schema
def test_serialize_s3_parquet(sample_df, s3_client, s3_bucket, profile_name):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init()
    sample_df.ww.to_disk(TEST_S3_URL, format="parquet", profile_name=profile_name)
    make_public(s3_client, s3_bucket)
    deserialized_df = read_woodwork_table(
        TEST_S3_URL, filename="data.parquet", profile_name=profile_name
    )

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
    )
    assert sample_df.ww.schema == deserialized_df.ww.schema
def test_to_disk_with_latlong(latlong_df, tmpdir, file_format):
    if file_format in ("arrow", "feather") and not isinstance(latlong_df, pd.DataFrame):
        pytest.xfail("Arrow IPC format (Feather) not supported on Dask or Spark")
    latlong_df.ww.init(logical_types={col: "LatLong" for col in latlong_df.columns})

    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(latlong_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(latlong_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            latlong_df.ww.to_disk(str(tmpdir), format=file_format)
    else:
        latlong_df.ww.to_disk(str(tmpdir), format=file_format)

        filename = None
        format = None
        if file_format == "parquet":
            if _is_dask_dataframe(latlong_df) or _is_spark_dataframe(latlong_df):
                format = "parquet"
            else:
                filename = "data.parquet"

        deserialized_df = read_woodwork_table(str(tmpdir), filename=filename, format=format)
        pd.testing.assert_frame_equal(
            to_pandas(latlong_df, index=latlong_df.ww.index, sort_index=True),
            to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
        )
        assert latlong_df.ww.schema == deserialized_df.ww.schema
def test_serialize_s3_pickle(sample_df_pandas, s3_client, s3_bucket, profile_name):
    sample_df_pandas.ww.init()
    sample_df_pandas.ww.to_disk(TEST_S3_URL, format="pickle", profile_name=profile_name)
    make_public(s3_client, s3_bucket)
    deserialized_df = read_woodwork_table(TEST_S3_URL, profile_name=profile_name)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
    )
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
def test_to_disk_parquet_typing_info_file_is_none(sample_df, tmpdir):
    sample_df.ww.init(index="id")
    sample_df.ww.to_disk(str(tmpdir), format="parquet")

    filename = "data.parquet"
    format = None
    if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
        filename = None
        format = "parquet"

    deserialized_df = read_woodwork_table(
        str(tmpdir),
        filename=filename,
        typing_info_filename=None,
        format=format,
    )
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
    )
def test_to_csv(sample_df, tmpdir):
    if _is_dask_dataframe(sample_df):
        # Dask errors with pd.NA in some partitions, but not others
        sample_df["age"] = sample_df["age"].fillna(25)
    sample_df.ww.init(
        name="test_data",
        index="id",
        semantic_tags={"id": "tag1"},
        logical_types={"age": Ordinal(order=[25, 33, 57])},
        column_descriptions={
            "signup_date": "original signup date",
            "age": "age of the user",
        },
        column_origins={
            "phone_number": "base",
            "age": "base",
            "signup_date": "engineered",
        },
        column_metadata={
            "id": {"is_sorted": True},
            "age": {"interesting_values": [33, 57]},
        },
    )
    sample_df.ww.to_disk(str(tmpdir), format="csv", encoding="utf-8", engine="python")
    deserialized_df = read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(deserialized_df, index=deserialized_df.ww.index, sort_index=True),
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
    )
    assert deserialized_df.ww.schema == sample_df.ww.schema
def description_to_entityset(description, **kwargs): """Deserialize entityset from data description. Args: description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description` kwargs (keywords): Additional keyword arguments to pass as keywords arguments to the underlying deserialization method. Returns: entityset (EntitySet) : Instance of :class:`.EntitySet`. """ check_schema_version(description, "entityset") from featuretools.entityset import EntitySet # If data description was not read from disk, path is None. path = description.get("path") entityset = EntitySet(description["id"]) for df in description["dataframes"].values(): if path is not None: data_path = os.path.join(path, "data", df["name"]) format = description.get("format") if format is not None: kwargs["format"] = format if format == "parquet" and df["loading_info"][ "table_type"] == "pandas": kwargs["filename"] = df["name"] + ".parquet" dataframe = read_woodwork_table(data_path, validate=False, **kwargs) else: dataframe = empty_dataframe(df) entityset.add_dataframe(dataframe) for relationship in description["relationships"]: rel = Relationship.from_dictionary(relationship, entityset) entityset.add_relationship(relationship=rel) return entityset
def test_deserialize_validation_control(mock_validate_accessor_params):
    assert not mock_validate_accessor_params.called
    deserialize.read_woodwork_table(URL)
    assert not mock_validate_accessor_params.called

    deserialize.read_woodwork_table(URL, validate=True)
    assert mock_validate_accessor_params.called