Example #1
def test_to_disk_with_whitespace(whitespace_df, tmpdir, format):
    df = whitespace_df.copy()
    df.ww.init(index="id", logical_types={"comments": "NaturalLanguage"})
    if format == "pickle" and not isinstance(df, pd.DataFrame):
        msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        with pytest.raises(ValueError, match=msg):
            df.ww.to_disk(str(tmpdir), format="pickle")
    else:
        df.ww.to_disk(str(tmpdir), format=format)
        if format == "parquet":
            filename = "data.parquet"
            format = None
            if _is_dask_dataframe(whitespace_df) or _is_spark_dataframe(
                    whitespace_df):
                filename = None
                format = "parquet"
            deserialized_df = read_woodwork_table(
                path=str(tmpdir),
                filename=filename,
                format=format,
            )
        else:
            deserialized_df = read_woodwork_table(str(tmpdir))
        assert deserialized_df.ww.schema == df.ww.schema
        pd.testing.assert_frame_equal(
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
            to_pandas(df, index=df.ww.index, sort_index=True),
        )
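All of these tests share the same round-trip shape: initialize a Woodwork schema on the frame, write it out with to_disk, read it back with read_woodwork_table, and compare schemas. A minimal, self-contained sketch of that pattern (the toy frame and /tmp/ww_table path are illustrative, not from the tests):

import pandas as pd
from woodwork.deserialize import read_woodwork_table

df = pd.DataFrame({"id": [0, 1, 2], "age": [25, 33, 57]})
df.ww.init(index="id")                        # attach typing information
df.ww.to_disk("/tmp/ww_table", format="csv")  # writes data plus typing info

restored = read_woodwork_table("/tmp/ww_table")
assert restored.ww.schema == df.ww.schema     # schema survives the round trip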
Example #2
def test_to_csv_use_standard_tags(sample_df, tmpdir):
    no_standard_tags_df = sample_df.copy()
    no_standard_tags_df.ww.init(use_standard_tags=False)

    no_standard_tags_df.ww.to_disk(str(tmpdir),
                                   format="csv",
                                   encoding="utf-8",
                                   engine="python")
    deserialized_no_tags_df = read_woodwork_table(str(tmpdir))
    shutil.rmtree(str(tmpdir))

    standard_tags_df = sample_df.copy()
    standard_tags_df.ww.init(use_standard_tags=True)

    standard_tags_df.ww.to_disk(str(tmpdir),
                                format="csv",
                                encoding="utf-8",
                                engine="python")
    deserialized_tags_df = read_woodwork_table(str(tmpdir))
    shutil.rmtree(str(tmpdir))

    assert no_standard_tags_df.ww.schema != standard_tags_df.ww.schema

    assert deserialized_no_tags_df.ww.schema == no_standard_tags_df.ww.schema
    assert deserialized_tags_df.ww.schema == standard_tags_df.ww.schema
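The two schemas differ above because use_standard_tags controls whether Woodwork attaches default semantic tags derived from each column's logical type, and the setting is preserved through serialization. A small sketch of the effect (toy frame; the 'numeric' tag on integer columns is the expected standard-tag behavior):

import pandas as pd
import woodwork  # noqa: F401 -- importing woodwork registers the .ww accessor

tagged = pd.DataFrame({"a": [1, 2, 3]})
tagged.ww.init(use_standard_tags=True)
print(tagged.ww.semantic_tags["a"])    # expected: {'numeric'}

untagged = pd.DataFrame({"a": [1, 2, 3]})
untagged.ww.init(use_standard_tags=False)
print(untagged.ww.semantic_tags["a"])  # expected: set()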
Example #3
def test_to_disk_parquet_no_file_extension(sample_df, tmpdir):
    if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
        pytest.skip(
            "Specifying filename for writing Dask or Spark DataFrames to parquet is not supported."
        )
    sample_df.ww.init(index="id")
    sample_df.ww.to_disk(str(tmpdir),
                         filename="parquet_data",
                         format="parquet")

    error_msg = "Could not determine format. Please specify filename and/or format."
    # Without specifying format, WW doesn't know what type of file this is
    with pytest.raises(ValueError, match=error_msg):
        deserialized_df = read_woodwork_table(
            str(tmpdir),
            filename="parquet_data",
        )

    deserialized_df = read_woodwork_table(
        str(tmpdir),
        filename="parquet_data",
        format="parquet",
    )
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
    )
Example #4
def test_categorical_dtype_serialization(serialize_df, tmpdir):
    ltypes = {
        "cat_int": Categorical,
        "ord_int": Ordinal(order=[1, 2]),
        "cat_float": Categorical,
        "ord_float": Ordinal(order=[1.0, 2.0]),
        "cat_bool": Categorical,
        "ord_bool": Ordinal(order=[True, False]),
    }
    if isinstance(serialize_df, pd.DataFrame):
        formats = ["csv", "pickle", "parquet"]
    else:
        formats = ["csv"]

    for format in formats:
        df = serialize_df.copy()
        df.ww.init(index="id", logical_types=ltypes)
        df.ww.to_disk(str(tmpdir), format=format)
        deserialized_df = read_woodwork_table(str(tmpdir),
                                              filename=f"data.{format}")
        pd.testing.assert_frame_equal(
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
            to_pandas(df, index=df.ww.index, sort_index=True),
        )
        assert deserialized_df.ww.schema == df.ww.schema
        shutil.rmtree(str(tmpdir))
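Unlike Categorical, Ordinal carries an explicit order covering the column's values, and the loop above confirms it survives csv, pickle, and parquet. A minimal sketch of declaring an ordered column (toy data, using the same Ordinal logical type as the test):

import pandas as pd
from woodwork.logical_types import Ordinal

df = pd.DataFrame({"id": [0, 1, 2], "size": ["small", "medium", "large"]})
df.ww.init(index="id",
           logical_types={"size": Ordinal(order=["small", "medium", "large"])})
print(df.ww.logical_types["size"])  # Ordinal logical type with the declared order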
Example #5
def test_to_csv_S3(sample_df, s3_client, s3_bucket, profile_name):
    xfail_tmp_disappears(sample_df)

    sample_df.ww.init(
        name="test_data",
        index="id",
        semantic_tags={"id": "tag1"},
        logical_types={"age": Ordinal(order=[25, 33, 57])},
    )
    sample_df.ww.to_disk(
        TEST_S3_URL,
        format="csv",
        encoding="utf-8",
        engine="python",
        profile_name=profile_name,
    )
    make_public(s3_client, s3_bucket)

    deserialized_df = read_woodwork_table(TEST_S3_URL,
                                          profile_name=profile_name)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
    )
    assert sample_df.ww.schema == deserialized_df.ww.schema
Example #6
def test_to_csv(sample_df, tmpdir):
    sample_df.ww.init(name='test_data',
                      index='id',
                      semantic_tags={'id': 'tag1'},
                      logical_types={'age': Ordinal(order=[25, 33, 57])},
                      column_descriptions={
                          'signup_date': 'original signup date',
                          'age': 'age of the user'
                      },
                      column_metadata={
                          'id': {
                              'is_sorted': True
                          },
                          'age': {
                              'interesting_values': [33, 57]
                          }
                      })

    sample_df.ww.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True))
    assert deserialized_df.ww.schema == sample_df.ww.schema
Example #7
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
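A hedged sketch of how this deserializer pairs with the serializer mentioned in the docstring (assumptions: entityset_to_description is importable from featuretools.entityset.serialize, and the mock customer demo dataset is available; since the in-memory description has no path, the dataframes come back empty via empty_dataframe):

import featuretools as ft
from featuretools.entityset.serialize import entityset_to_description

es = ft.demo.load_mock_customer(return_entityset=True)
description = entityset_to_description(es)
es_copy = description_to_entityset(description)  # the function defined above
assert es_copy.id == es.id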
Example #8
def test_deserialize_s3_csv(sample_df_pandas):
    sample_df_pandas.ww.init(index='id')
    deserialized_df = deserialize.read_woodwork_table(S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
Example #9
def test_deserialize_url_csv(sample_df_pandas, profile_name):
    sample_df_pandas.ww.init(index="id")
    deserialized_df = read_woodwork_table(URL, profile_name=profile_name)
    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index),
    )
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
Example #10
def test_deserialize_url_csv_anon(sample_df_pandas):
    sample_df_pandas.ww.init(index='id')
    deserialized_df = deserialize.read_woodwork_table(URL, profile_name=False)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
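Across these examples, profile_name selects the AWS credentials used for remote reads: a string names an AWS profile, while profile_name=False appears to request anonymous access to a public bucket or URL. A minimal sketch with hypothetical locations:

from woodwork.deserialize import read_woodwork_table

# Anonymous read of a public table (no credentials are looked up):
public_df = read_woodwork_table("s3://some-public-bucket/table", profile_name=False)

# Read the same way but with the AWS profile named "test":
private_df = read_woodwork_table("s3://some-private-bucket/table", profile_name="test")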
Example #11
def test_to_parquet(sample_df, tmpdir):
    sample_df.ww.init(index='id')
    sample_df.ww.to_parquet(str(tmpdir))
    deserialized_df = deserialize.read_woodwork_table(str(tmpdir))
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True))
    assert sample_df.ww.schema == deserialized_df.ww.schema
Example #12
def test_deserialize_handles_indexes(sample_df, tmpdir):
    sample_df.ww.init(
        name="test_data",
        index="id",
        time_index="signup_date",
    )
    sample_df.ww.to_disk(str(tmpdir), format="csv")
    deserialized_df = read_woodwork_table(str(tmpdir))
    assert deserialized_df.ww.index == "id"
    assert deserialized_df.ww.time_index == "signup_date"
Example #13
def test_to_csv_use_standard_tags(sample_df, tmpdir):
    no_standard_tags_df = sample_df.copy()
    no_standard_tags_df.ww.init(use_standard_tags=False)

    no_standard_tags_df.ww.to_csv(str(tmpdir),
                                  encoding='utf-8',
                                  engine='python')
    deserialized_no_tags_df = deserialize.read_woodwork_table(str(tmpdir))

    standard_tags_df = sample_df.copy()
    standard_tags_df.ww.init(use_standard_tags=True)

    standard_tags_df.ww.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    deserialized_tags_df = deserialize.read_woodwork_table(str(tmpdir))

    assert no_standard_tags_df.ww.schema != standard_tags_df.ww.schema

    assert deserialized_no_tags_df.ww.schema == no_standard_tags_df.ww.schema
    assert deserialized_tags_df.ww.schema == standard_tags_df.ww.schema
Example #14
def test_serialize_s3_pickle_anon(sample_df_pandas, s3_client, s3_bucket):
    sample_df_pandas.ww.init()
    sample_df_pandas.ww.to_pickle(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL,
                                                      profile_name=False)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas, index=sample_df_pandas.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
Example #15
def test_to_disk(sample_df, tmpdir, file_format):
    if file_format in ("arrow",
                       "feather") and not isinstance(sample_df, pd.DataFrame):
        pytest.xfail(
            "Arrow IPC format (Feather) not supported on Dask or Spark")

    sample_df.ww.init(index="id")
    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(sample_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(sample_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            sample_df.ww.to_disk(str(tmpdir), format=file_format)
    else:
        sample_df.ww.to_disk(str(tmpdir), format=file_format)
        if file_format == "parquet":
            filename = "data.parquet"
            format = None
            if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
                filename = None
                format = "parquet"
            deserialized_df = read_woodwork_table(
                path=str(tmpdir),
                filename=filename,
                format=format,
            )
        else:
            deserialized_df = read_woodwork_table(str(tmpdir))
        pd.testing.assert_frame_equal(
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
        )
        assert sample_df.ww.schema == deserialized_df.ww.schema
Example #16
def test_to_parquet_with_latlong(latlong_df, tmpdir):
    latlong_df.ww.init(
        logical_types={col: 'LatLong'
                       for col in latlong_df.columns})
    latlong_df.ww.to_parquet(str(tmpdir))
    deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(latlong_df, index=latlong_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True))
    assert latlong_df.ww.schema == deserialized_df.ww.schema
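LatLong columns hold (latitude, longitude) pairs, and the test above shows they round-trip through parquet without losing the logical type. A minimal sketch of initializing one (toy coordinates; the string name "LatLong" matches the usage above):

import pandas as pd
import woodwork  # noqa: F401 -- importing woodwork registers the .ww accessor

df = pd.DataFrame({"loc": [(42.36, -71.06), (40.71, -74.01)]})
df.ww.init(logical_types={"loc": "LatLong"})
print(df.ww.logical_types["loc"])  # LatLong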
Example #17
def test_to_disk_custom_data_filename(sample_df, tmpdir, file_format):
    if file_format in ("arrow",
                       "feather") and not isinstance(sample_df, pd.DataFrame):
        pytest.xfail(
            "Arrow IPC format (Feather) not supported on Dask or Spark")

    sample_df.ww.init(index="id")
    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(sample_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(sample_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "parquet" and _is_dask_dataframe(sample_df):
        error_msg = "Writing a Dask dataframe to parquet with a filename specified is not supported"
        error_type = ValueError
    elif file_format == "csv" and _is_spark_dataframe(sample_df):
        error_msg = "Writing a Spark dataframe to csv with a filename specified is not supported"
        error_type = ValueError
    elif file_format == "parquet" and _is_spark_dataframe(sample_df):
        error_msg = "Writing a Spark dataframe to parquet with a filename specified is not supported"
        error_type = ValueError

    data_filename = f"custom_data.{file_format}"
    filename_to_check = data_filename
    if _is_dask_dataframe(sample_df):
        data_filename = f"custom_data-*.{file_format}"
        filename_to_check = f"custom_data-0.{file_format}"

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            sample_df.ww.to_disk(path=str(tmpdir),
                                 format=file_format,
                                 filename=data_filename)
    else:
        sample_df.ww.to_disk(path=str(tmpdir),
                             format=file_format,
                             filename=data_filename)
        assert os.path.isfile(os.path.join(tmpdir, "data", filename_to_check))
        deserialized_df = read_woodwork_table(
            path=str(tmpdir),
            filename=data_filename,
        )
        pd.testing.assert_frame_equal(
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
        )
        assert sample_df.ww.schema == deserialized_df.ww.schema
Example #18
def test_serialize_s3_parquet(sample_df, s3_client, s3_bucket):
    xfail_tmp_disappears(sample_df)

    sample_df.ww.init()
    sample_df.ww.to_parquet(TEST_S3_URL)
    make_public(s3_client, s3_bucket)
    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True))
    assert sample_df.ww.schema == deserialized_df.ww.schema
Example #19
def test_s3_test_profile(sample_df, s3_client, s3_bucket, setup_test_profile):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init()
    sample_df.ww.to_csv(TEST_S3_URL,
                        encoding='utf-8',
                        engine='python',
                        profile_name='test')
    make_public(s3_client, s3_bucket)
    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL,
                                                      profile_name='test')

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index))
    assert sample_df.ww.schema == deserialized_df.ww.schema
Example #20
def test_to_pickle(sample_df, tmpdir):
    sample_df.ww.init()
    if not isinstance(sample_df, pd.DataFrame):
        msg = 'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
        with pytest.raises(ValueError, match=msg):
            sample_df.ww.to_pickle(str(tmpdir))
    else:
        sample_df.ww.to_pickle(str(tmpdir))
        deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

        pd.testing.assert_frame_equal(
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True))
        assert deserialized_df.ww.schema == sample_df.ww.schema
Example #21
def test_to_csv_S3(sample_df, s3_client, s3_bucket):
    xfail_tmp_disappears(sample_df)

    sample_df.ww.init(name='test_data',
                      index='id',
                      semantic_tags={'id': 'tag1'},
                      logical_types={'age': Ordinal(order=[25, 33, 57])})
    sample_df.ww.to_csv(TEST_S3_URL, encoding='utf-8', engine='python')
    make_public(s3_client, s3_bucket)

    deserialized_df = deserialize.read_woodwork_table(TEST_S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True))
    assert sample_df.ww.schema == deserialized_df.ww.schema
Example #22
def test_to_pickle_with_latlong(latlong_df, tmpdir):
    latlong_df.ww.init(
        logical_types={col: 'LatLong'
                       for col in latlong_df.columns})
    if not isinstance(latlong_df, pd.DataFrame):
        msg = 'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
        with pytest.raises(ValueError, match=msg):
            latlong_df.ww.to_pickle(str(tmpdir))
    else:
        latlong_df.ww.to_pickle(str(tmpdir))
        deserialized_df = deserialize.read_woodwork_table(str(tmpdir))

        pd.testing.assert_frame_equal(
            to_pandas(latlong_df, index=latlong_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True))
        assert latlong_df.ww.schema == deserialized_df.ww.schema
Example #23
def test_s3_test_profile(sample_df, s3_client, s3_bucket, setup_test_profile):
    xfail_tmp_disappears(sample_df)
    sample_df.ww.init()
    sample_df.ww.to_disk(
        TEST_S3_URL,
        format="csv",
        encoding="utf-8",
        engine="python",
        profile_name="test",
    )
    make_public(s3_client, s3_bucket)
    deserialized_df = read_woodwork_table(TEST_S3_URL, profile_name="test")

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index),
        to_pandas(deserialized_df, index=deserialized_df.ww.index),
    )
    assert sample_df.ww.schema == deserialized_df.ww.schema
Example #24
def test_serialize_s3_parquet(sample_df, s3_client, s3_bucket, profile_name):
    xfail_tmp_disappears(sample_df)

    sample_df.ww.init()
    sample_df.ww.to_disk(TEST_S3_URL,
                         format="parquet",
                         profile_name=profile_name)
    make_public(s3_client, s3_bucket)
    deserialized_df = read_woodwork_table(TEST_S3_URL,
                                          filename="data.parquet",
                                          profile_name=profile_name)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
    )
    assert sample_df.ww.schema == deserialized_df.ww.schema
Example #25
def test_to_disk_with_latlong(latlong_df, tmpdir, file_format):
    if file_format in ("arrow",
                       "feather") and not isinstance(latlong_df, pd.DataFrame):
        pytest.xfail(
            "Arrow IPC format (Feather) not supported on Dask or Spark")

    latlong_df.ww.init(
        logical_types={col: "LatLong"
                       for col in latlong_df.columns})

    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(latlong_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(latlong_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            latlong_df.ww.to_disk(str(tmpdir), format=file_format)
    else:
        latlong_df.ww.to_disk(str(tmpdir), format=file_format)
        filename = None
        format = None
        if file_format == "parquet":
            if _is_dask_dataframe(latlong_df) or _is_spark_dataframe(
                    latlong_df):
                format = "parquet"
            else:
                filename = "data.parquet"

        deserialized_df = read_woodwork_table(str(tmpdir),
                                              filename=filename,
                                              format=format)

        pd.testing.assert_frame_equal(
            to_pandas(latlong_df, index=latlong_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
        )
        assert latlong_df.ww.schema == deserialized_df.ww.schema
Example #26
def test_serialize_s3_pickle(sample_df_pandas, s3_client, s3_bucket,
                             profile_name):
    sample_df_pandas.ww.init()
    sample_df_pandas.ww.to_disk(TEST_S3_URL,
                                format="pickle",
                                profile_name=profile_name)
    make_public(s3_client, s3_bucket)
    deserialized_df = read_woodwork_table(TEST_S3_URL,
                                          profile_name=profile_name)

    pd.testing.assert_frame_equal(
        to_pandas(sample_df_pandas,
                  index=sample_df_pandas.ww.index,
                  sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
    )
    assert sample_df_pandas.ww.schema == deserialized_df.ww.schema
Example #27
def test_to_disk_parquet_typing_info_file_is_none(sample_df, tmpdir):
    sample_df.ww.init(index="id")
    sample_df.ww.to_disk(str(tmpdir), format="parquet")

    filename = "data.parquet"
    format = None
    if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
        filename = None
        format = "parquet"
    deserialized_df = read_woodwork_table(
        str(tmpdir),
        filename=filename,
        typing_info_filename=None,
        format=format,
    )
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
    )
Example #28
def test_to_csv(sample_df, tmpdir):
    if _is_dask_dataframe(sample_df):
        # Dask errors with pd.NA in some partitions, but not others
        sample_df["age"] = sample_df["age"].fillna(25)
    sample_df.ww.init(
        name="test_data",
        index="id",
        semantic_tags={"id": "tag1"},
        logical_types={"age": Ordinal(order=[25, 33, 57])},
        column_descriptions={
            "signup_date": "original signup date",
            "age": "age of the user",
        },
        column_origins={
            "phone_number": "base",
            "age": "base",
            "signup_date": "engineered",
        },
        column_metadata={
            "id": {
                "is_sorted": True
            },
            "age": {
                "interesting_values": [33, 57]
            },
        },
    )
    sample_df.ww.to_disk(str(tmpdir),
                         format="csv",
                         encoding="utf-8",
                         engine="python")
    deserialized_df = read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
    )
    assert deserialized_df.ww.schema == sample_df.ww.schema
Example #29
def description_to_entityset(description, **kwargs):
    """Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    """
    check_schema_version(description, "entityset")

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get("path")
    entityset = EntitySet(description["id"])

    for df in description["dataframes"].values():
        if path is not None:
            data_path = os.path.join(path, "data", df["name"])
            format = description.get("format")
            if format is not None:
                kwargs["format"] = format
                if format == "parquet" and df["loading_info"][
                        "table_type"] == "pandas":
                    kwargs["filename"] = df["name"] + ".parquet"
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description["relationships"]:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
Example #30
def test_deserialize_validation_control(mock_validate_accessor_params):
    assert not mock_validate_accessor_params.called
    deserialize.read_woodwork_table(URL)
    assert not mock_validate_accessor_params.called
    deserialize.read_woodwork_table(URL, validate=True)
    assert mock_validate_accessor_params.called
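This last test indicates that read_woodwork_table skips accessor-parameter validation by default and only runs it when validate=True, making validation an opt-in cost. A minimal sketch of the trade-off (the local path is illustrative):

from woodwork.deserialize import read_woodwork_table

df = read_woodwork_table("/tmp/ww_table")  # default: no parameter validation
df_checked = read_woodwork_table("/tmp/ww_table", validate=True)  # opt in to checks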