Example #1
def test_to_disk_custom_data_subdirectory(sample_df, tmpdir, file_format,
                                          data_subdirectory):
    if file_format in ("arrow",
                       "feather") and not isinstance(sample_df, pd.DataFrame):
        pytest.xfail(
            "Arrow IPC format (Feather) not supported on Dask or Spark")

    sample_df.ww.init(index="id")
    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(sample_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(sample_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            sample_df.ww.to_disk(str(tmpdir),
                                 format=file_format,
                                 data_subdirectory=data_subdirectory)
        shutil.rmtree(str(tmpdir))

    else:
        sample_df.ww.to_disk(str(tmpdir),
                             format=file_format,
                             data_subdirectory=data_subdirectory)
        if data_subdirectory:
            assert os.path.exists(os.path.join(tmpdir, data_subdirectory))
        filename = None
        format = None
        if file_format == "parquet":
            if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
                format = "parquet"
            else:
                filename = "data.parquet"

        deserialized_df = read_woodwork_table(
            str(tmpdir),
            filename=filename,
            data_subdirectory=data_subdirectory,
            format=format,
        )
        pd.testing.assert_frame_equal(
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
        )
        assert sample_df.ww.schema == deserialized_df.ww.schema
        shutil.rmtree(str(tmpdir))
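For orientation, the round trip exercised by the test above can be reproduced directly. The sketch below is a minimal pandas-only version; it assumes a recent Woodwork release where `DataFrame.ww.to_disk` and `read_woodwork_table` behave as in the tests, and the sample data is made up.

import tempfile

import pandas as pd
import woodwork as ww  # noqa: F401  (registers the .ww accessor)
from woodwork.deserialize import read_woodwork_table  # assumed import path

df = pd.DataFrame({"id": [0, 1, 2], "age": [25, 33, 57]})
df.ww.init(index="id")

with tempfile.TemporaryDirectory() as tmp:
    # Writes the data under <tmp>/data plus a typing-info JSON file
    df.ww.to_disk(tmp, format="csv")
    restored = read_woodwork_table(tmp)

assert restored.ww.schema == df.ww.schema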
Example #2
def test_to_disk_custom_data_filename(sample_df, tmpdir, file_format):
    if file_format in ("arrow",
                       "feather") and not isinstance(sample_df, pd.DataFrame):
        pytest.xfail(
            "Arrow IPC format (Feather) not supported on Dask or Spark")

    sample_df.ww.init(index="id")
    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(sample_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(sample_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "parquet" and _is_dask_dataframe(sample_df):
        error_msg = "Writing a Dask dataframe to parquet with a filename specified is not supported"
        error_type = ValueError
    elif file_format == "csv" and _is_spark_dataframe(sample_df):
        error_msg = "Writing a Spark dataframe to csv with a filename specified is not supported"
        error_type = ValueError
    elif file_format == "parquet" and _is_spark_dataframe(sample_df):
        error_msg = "Writing a Spark dataframe to parquet with a filename specified is not supported"
        error_type = ValueError

    data_filename = f"custom_data.{file_format}"
    filename_to_check = data_filename
    if _is_dask_dataframe(sample_df):
        data_filename = f"custom_data-*.{file_format}"
        filename_to_check = f"custom_data-0.{file_format}"

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            sample_df.ww.to_disk(path=str(tmpdir),
                                 format=file_format,
                                 filename=data_filename)
    else:
        sample_df.ww.to_disk(path=str(tmpdir),
                             format=file_format,
                             filename=data_filename)
        assert os.path.isfile(os.path.join(tmpdir, "data", filename_to_check))
        deserialized_df = read_woodwork_table(
            path=str(tmpdir),
            filename=data_filename,
        )
        pd.testing.assert_frame_equal(
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
        )
        assert sample_df.ww.schema == deserialized_df.ww.schema
Example #3
    def _save_parquet_table_to_disk(self):
        """Writes data to disk with the updated metadata including WW typing info."""
        if _is_dask_dataframe(self.dataframe):
            path, dataframe = self._setup_for_dask_and_spark()
            dataframe.to_parquet(path, custom_metadata=self.metadata)
        elif _is_spark_dataframe(self.dataframe):
            path, dataframe = self._setup_for_dask_and_spark()
            dataframe.to_parquet(path)
            files = os.listdir(path)

            # Update first parquet file to save WW metadata
            parquet_files = sorted([f for f in files if Path(f).suffix == ".parquet"])
            update_file = os.path.join(path, parquet_files[0])
            table = pq.read_table(update_file)
            table_metadata = table.schema.metadata
            combined_meta = {
                **self.metadata,
                **table_metadata,
            }
            table = table.replace_schema_metadata(combined_meta)
            pq.write_table(table, update_file)

            # Remove checksum files, which would prevent deserialization now that
            # the parquet header has been updated
            crc_files = [f for f in files if Path(f).suffix == ".crc"]
            for file in crc_files:
                os.remove(os.path.join(path, file))
        else:
            file = self._get_filename()
            self.table = self.table.replace_schema_metadata(self.metadata)
            pq.write_table(self.table, file)
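The typing information written by the serializer above ends up under the `ww_meta` key of the parquet schema metadata (see also Examples #19 and #20). A hedged sketch for inspecting it, assuming the single-file pandas layout `<output_dir>/data/data.parquet` used in the tests:

import json
import os

import pyarrow.parquet as pq

path = os.path.join("output_dir", "data", "data.parquet")  # hypothetical path
file_metadata = pq.read_metadata(path).metadata
typing_info = json.loads(file_metadata[b"ww_meta"])
print(typing_info["loading_info"]["table_type"])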
Example #4
def test_iloc_with_properties(sample_df):
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")
    semantic_tags = {
        "full_name": "tag1",
        "email": ["tag2"],
        "phone_number": ["tag3", "tag2"],
        "signup_date": {"secondary_time_index"},
    }
    logical_types = {
        "full_name": Categorical,
        "email": EmailAddress,
        "phone_number": PhoneNumber,
        "age": Double,
    }
    df = sample_df.copy()
    df.ww.init(logical_types=logical_types, semantic_tags=semantic_tags)
    sliced = df.ww.iloc[1:3, 1:3]
    assert sliced.shape == (2, 2)
    assert sliced.ww.semantic_tags == {
        "full_name": {"category", "tag1"},
        "email": {"tag2"},
    }
    assert isinstance(sliced.ww.logical_types["full_name"], Categorical)
    assert isinstance(sliced.ww.logical_types["email"], EmailAddress)
    assert sliced.ww.index is None

    df = sample_df.copy()
    df.ww.init(logical_types=logical_types, use_standard_tags=False)
    sliced = df.ww.iloc[:, [0, 5]]
    assert sliced.ww.semantic_tags == {"id": set(), "signup_date": set()}
    assert isinstance(sliced.ww.logical_types["id"], Integer)
    assert isinstance(sliced.ww.logical_types["signup_date"], Datetime)
    assert sliced.ww.index is None
Example #5
def test_to_disk_parquet_no_file_extension(sample_df, tmpdir):
    if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
        pytest.skip(
            "Specifying filename for writing Dask or Spark DataFrames to parquet is not supported."
        )
    sample_df.ww.init(index="id")
    sample_df.ww.to_disk(str(tmpdir),
                         filename="parquet_data",
                         format="parquet")

    error_msg = "Could not determine format. Please specify filename and/or format."
    # Without specifying format, WW doesn't know what type of file this is
    with pytest.raises(ValueError, match=error_msg):
        deserialized_df = read_woodwork_table(
            str(tmpdir),
            filename="parquet_data",
        )

    deserialized_df = read_woodwork_table(
        str(tmpdir),
        filename="parquet_data",
        format="parquet",
    )
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
    )
Example #6
def test_iloc_dimensionality(sample_df):
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")
    semantic_tags = {
        "full_name": "tag1",
        "email": ["tag2"],
        "phone_number": ["tag3", "tag2"],
        "signup_date": {"secondary_time_index"},
    }
    logical_types = {
        "full_name": Categorical,
        "email": EmailAddress,
        "phone_number": PhoneNumber,
        "age": Double,
    }
    sample_df.ww.init(logical_types=logical_types, semantic_tags=semantic_tags)

    sliced_series_row = sample_df.ww.iloc[1]
    assert isinstance(sliced_series_row, pd.Series)
    assert set(sliced_series_row.index) == set(sample_df.columns)
    assert sliced_series_row.name == 1

    sliced_series_col = sample_df.ww.iloc[:, 1]
    assert isinstance(sliced_series_col.ww.logical_type, Categorical)
    assert sliced_series_col.ww.semantic_tags == {"tag1", "category"}
    assert sliced_series_col.ww.name == "full_name"
Example #7
def test_overwrite_error(sample_df, tmpdir, format):
    if format == "pickle" and (_is_dask_dataframe(sample_df)
                               or _is_spark_dataframe(sample_df)):
        pytest.skip("Cannot pickle dask and spark dataframes")

    folder_1 = str(tmpdir.join("folder_1"))
    folder_2 = str(tmpdir.join("folder_2"))
    sample_df.ww.init()

    if format != "parquet":
        # Parquet does not use a typing info file
        sample_df.ww.to_disk(folder_1, data_subdirectory=None, format=format)
        with pytest.raises(WoodworkFileExistsError,
                           match="Typing info already exists"):
            sample_df.ww.to_disk(folder_1, format=format)

    sample_df.ww.to_disk(folder_2, data_subdirectory=None, format=format)
    with pytest.raises(WoodworkFileExistsError,
                       match="Data file already exists"):
        sample_df.ww.to_disk(
            folder_2,
            format=format,
            typing_info_filename="new_typing_info",
            data_subdirectory=None,
        )

    shutil.rmtree(str(tmpdir))
Example #8
def test_to_disk_with_whitespace(whitespace_df, tmpdir, format):
    df = whitespace_df.copy()
    df.ww.init(index="id", logical_types={"comments": "NaturalLanguage"})
    if format == "pickle" and not isinstance(df, pd.DataFrame):
        msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        with pytest.raises(ValueError, match=msg):
            df.ww.to_disk(str(tmpdir), format="pickle")
    else:
        df.ww.to_disk(str(tmpdir), format=format)
        if format == "parquet":
            filename = "data.parquet"
            format = None
            if _is_dask_dataframe(whitespace_df) or _is_spark_dataframe(
                    whitespace_df):
                filename = None
                format = "parquet"
            deserialized_df = read_woodwork_table(
                path=str(tmpdir),
                filename=filename,
                format=format,
            )
        else:
            deserialized_df = read_woodwork_table(str(tmpdir))
        assert deserialized_df.ww.schema == df.ww.schema
        pd.testing.assert_frame_equal(
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
            to_pandas(df, index=df.ww.index, sort_index=True),
        )
Example #9
 def serialize(self, dataframe, profile_name, **kwargs):
     import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
     # Serialization to orc relies on pyarrow.Table.from_pandas which doesn't work with Dask
     if _is_dask_dataframe(dataframe):
         msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
         raise ValueError(msg)
     self.kwargs["engine"] = "pyarrow"
     return super().serialize(dataframe, profile_name, **kwargs)
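The pandas-only path this serializer depends on looks roughly like the sketch below. It is illustrative only: pyarrow.Table.from_pandas has no Dask counterpart, which is exactly why the ValueError above is raised.

import pandas as pd
import pyarrow as pa
import pyarrow.orc as orc

df = pd.DataFrame({"id": [0, 1], "age": [25, 33]})
table = pa.Table.from_pandas(df)    # only works for in-memory pandas data
orc.write_table(table, "data.orc")  # hypothetical output path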
Example #10
def test_iLocIndexer_class(sample_df):
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")
    sample_df.ww.init()
    ind = _iLocIndexer(sample_df)
    pd.testing.assert_frame_equal(to_pandas(ind.data), to_pandas(sample_df))
    pd.testing.assert_frame_equal(to_pandas(ind[1:2]),
                                  to_pandas(sample_df.iloc[1:2]))
    assert ind[0, 0] == 0
Example #11
def test_iloc_indices_column(sample_df):
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")
    sample_df.ww.init(index="id", time_index="signup_date")
    sliced_index = sample_df.ww.iloc[:, 0]
    assert sliced_index.ww.semantic_tags == {"index"}

    sliced_time_index = sample_df.ww.iloc[:, 5]
    assert sliced_time_index.ww.semantic_tags == {"time_index"}
Example #12
def test_to_disk_with_latlong(latlong_df, tmpdir, file_format):
    if file_format in ("arrow",
                       "feather") and not isinstance(latlong_df, pd.DataFrame):
        pytest.xfail(
            "Arrow IPC format (Feather) not supported on Dask or Spark")

    latlong_df.ww.init(
        logical_types={col: "LatLong"
                       for col in latlong_df.columns})

    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(latlong_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(latlong_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError

    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            latlong_df.ww.to_disk(str(tmpdir), format=file_format)
    else:
        latlong_df.ww.to_disk(str(tmpdir), format=file_format)
        filename = None
        format = None
        if file_format == "parquet":
            if _is_dask_dataframe(latlong_df) or _is_spark_dataframe(
                    latlong_df):
                format = "parquet"
            else:
                filename = "data.parquet"

        deserialized_df = read_woodwork_table(str(tmpdir),
                                              filename=filename,
                                              format=format)

        pd.testing.assert_frame_equal(
            to_pandas(latlong_df, index=latlong_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
        )
        assert latlong_df.ww.schema == deserialized_df.ww.schema
Example #13
def test_iloc_indices(sample_df):
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")
    df_with_index = sample_df.copy()
    df_with_index.ww.init(index="id")
    assert df_with_index.ww.iloc[:, [0, 5]].ww.index == "id"
    assert df_with_index.ww.iloc[:, [1, 2]].ww.index is None

    df_with_time_index = sample_df.copy()
    df_with_time_index.ww.init(time_index="signup_date")
    assert df_with_time_index.ww.iloc[:, [0, 5]].ww.time_index == "signup_date"
    assert df_with_time_index.ww.iloc[:, [1, 2]].ww.index is None
Example #14
 def serialize(self, dataframe, profile_name, **kwargs):
     import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
     if self.filename is not None and _is_dask_dataframe(dataframe):
         raise ValueError(
             "Writing a Dask dataframe to parquet with a filename specified is not supported"
         )
     if self.filename is not None and _is_spark_dataframe(dataframe):
         raise ValueError(
             "Writing a Spark dataframe to parquet with a filename specified is not supported"
         )
     self.kwargs["engine"] = "pyarrow"
     return super().serialize(dataframe, profile_name, **kwargs)
Example #15
def _get_value_counts(dataframe, ascending=False, top_n=10, dropna=False):
    """Returns a list of dictionaries with counts for the most frequent values in each column (only
        for columns with `category` as a standard tag).


    Args:
        dataframe (pd.DataFrame, dd.DataFrame, ps.DataFrame): Data from which to count values.
        ascending (bool): Defines whether each list of values should be sorted most frequent
            to least frequent value (False), or least frequent to most frequent value (True).
            Defaults to False.

        top_n (int): the number of top values to retrieve. Defaults to 10.

        dropna (bool): determines whether to remove NaN values when finding frequency. Defaults
            to False.

    Returns:
        dict[str -> list(dict)]: a dictionary mapping each categorical column name to a list of
        dictionaries with keys `count` and `value`.
    """
    val_counts = {}
    valid_cols = [
        col for col, column in dataframe.ww.columns.items()
        if column.is_categorical
    ]
    data = dataframe[valid_cols]
    is_ks = False
    if _is_dask_dataframe(data):
        data = data.compute()
    if _is_spark_dataframe(data):
        data = data.to_pandas()
        is_ks = True

    for col in valid_cols:
        if dropna and is_ks:
            # Spark categorical columns will have missing values replaced with the string 'None'
            # Replace them with np.nan so dropna works
            datacol = data[col].replace(to_replace="None", value=np.nan)
        else:
            datacol = data[col]
        frequencies = datacol.value_counts(ascending=ascending, dropna=dropna)
        df = frequencies[:top_n].reset_index()
        df.columns = ["value", "count"]
        values = list(df.to_dict(orient="index").values())
        val_counts[col] = values
    return val_counts
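A hedged usage sketch: in recent Woodwork releases this helper backs the public `df.ww.value_counts()` accessor, which is assumed here to mirror the helper's signature; the sample data is made up.

import pandas as pd
import woodwork as ww  # noqa: F401  (registers the .ww accessor)

df = pd.DataFrame({"color": ["red", "red", "red", "blue", "blue"]})
df.ww.init(logical_types={"color": "Categorical"})

counts = df.ww.value_counts(top_n=2)
# e.g. {"color": [{"value": "red", "count": 3}, {"value": "blue", "count": 2}]}
print(counts["color"])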
Example #16
def test_iloc_table_does_not_propagate_changes_to_data(sample_df):
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")
    sample_df.ww.init()
    sliced = sample_df.ww.iloc[1:3, 1:3]

    sample_df.ww.add_semantic_tags({"full_name": "new_tag"})
    assert sliced.ww.semantic_tags["full_name"] == set()
    assert (sliced.ww.semantic_tags["full_name"]
            is not sample_df.ww.semantic_tags["full_name"])

    sample_df.ww.metadata["new_key"] = "new_value"
    assert sliced.ww.metadata == {}
    assert sliced.ww.metadata is not sample_df.ww.metadata

    sample_df.ww.columns["email"].metadata["new_key"] = "new_value"
    assert sliced.ww.columns["email"].metadata == {}
    assert (sliced.ww.columns["email"].metadata
            is not sample_df.ww.columns["email"].metadata)
Example #17
def test_to_disk_parquet_typing_info_file_is_none(sample_df, tmpdir):
    sample_df.ww.init(index="id")
    sample_df.ww.to_disk(str(tmpdir), format="parquet")

    filename = "data.parquet"
    format = None
    if _is_dask_dataframe(sample_df) or _is_spark_dataframe(sample_df):
        filename = None
        format = "parquet"
    deserialized_df = read_woodwork_table(
        str(tmpdir),
        filename=filename,
        typing_info_filename=None,
        format=format,
    )
    pd.testing.assert_frame_equal(
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
    )
Example #18
def test_to_csv(sample_df, tmpdir):
    if _is_dask_dataframe(sample_df):
        # Dask errors when pd.NA appears in some partitions but not others
        sample_df["age"] = sample_df["age"].fillna(25)
    sample_df.ww.init(
        name="test_data",
        index="id",
        semantic_tags={"id": "tag1"},
        logical_types={"age": Ordinal(order=[25, 33, 57])},
        column_descriptions={
            "signup_date": "original signup date",
            "age": "age of the user",
        },
        column_origins={
            "phone_number": "base",
            "age": "base",
            "signup_date": "engineered",
        },
        column_metadata={
            "id": {
                "is_sorted": True
            },
            "age": {
                "interesting_values": [33, 57]
            },
        },
    )
    sample_df.ww.to_disk(str(tmpdir),
                         format="csv",
                         encoding="utf-8",
                         engine="python")
    deserialized_df = read_woodwork_table(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(deserialized_df,
                  index=deserialized_df.ww.index,
                  sort_index=True),
        to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
    )
    assert deserialized_df.ww.schema == sample_df.ww.schema
Example #19
def test_to_disk_parquet_saves_custom_metadata_as_expected(sample_df, tmpdir):
    sample_df.ww.init(index="id")
    sample_df.ww.set_types(logical_types={"categorical": "CountryCode"},
                           semantic_tags={"age": "age"})
    sample_df.ww.to_disk(str(tmpdir), format="parquet")
    expected_typing_info = typing_info_to_dict(sample_df)

    if _is_dask_dataframe(sample_df):
        filename = "part.0.parquet"
        path = os.path.join(tmpdir, "data", filename)
    elif _is_spark_dataframe(sample_df):
        path = os.path.join(tmpdir, "data")
        files = os.listdir(path)
        metadata_file = sorted(
            [f for f in files if Path(f).suffix == ".parquet"])[0]
        path = os.path.join(path, metadata_file)
    else:
        filename = "data.parquet"
        path = os.path.join(tmpdir, "data", filename)

    file_metadata = pa.parquet.read_metadata(path).metadata
    assert b"ww_meta" in file_metadata.keys()
    ww_meta = json.loads(file_metadata[b"ww_meta"])
    columns = ww_meta["column_typing_info"]
    cat_info = list(filter(lambda col: col["name"] == "categorical",
                           columns))[0]
    age_info = list(filter(lambda col: col["name"] == "age", columns))[0]

    assert cat_info["logical_type"]["type"] == "CountryCode"
    assert "age" in age_info["semantic_tags"]

    # location, type and params are added during serialization, so they are not present
    # in the expected typing information created from the Woodwork dataframe.
    del ww_meta["loading_info"]["location"]
    del ww_meta["loading_info"]["type"]
    del ww_meta["loading_info"]["params"]

    assert ww_meta == expected_typing_info
Example #20
 def _generate_parquet_metadata(self):
     """Generate metadata for the parquet file header. For pandas this includes additional
     information needed by pandas. For Dask/Spark, this includes only the Woodwork typing info."""
     loading_info = {
         "location": self.location,
         "type": self.format,
         "params": self.kwargs,
     }
     self.typing_info["loading_info"].update(loading_info)
     # For Dask and Spark we only get the WW metadata because we haven't created
     # the pyarrow table yet, but for pandas we combine the existing parquet
     # metadata with the WW metadata.
     if _is_dask_dataframe(self.dataframe) or _is_spark_dataframe(self.dataframe):
         metadata = {
             "ww_meta".encode(): json.dumps(self.typing_info).encode(),
         }
     else:
         table_metadata = self.table.schema.metadata
         metadata = {
             "ww_meta".encode(): json.dumps(self.typing_info).encode(),
             **table_metadata,
         }
     self.metadata = metadata
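For the pandas branch, the merge of pyarrow's own metadata with the Woodwork entry can be sketched as below; the typing-info payload is a stand-in, not the real dictionary, and the output path is hypothetical.

import json

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(pd.DataFrame({"id": [0, 1]}))
typing_info = {"schema_version": "illustrative"}  # stand-in payload

# Keep pyarrow's pandas metadata so dtypes still round-trip, then add ww_meta
metadata = {b"ww_meta": json.dumps(typing_info).encode(), **table.schema.metadata}
table = table.replace_schema_metadata(metadata)
pq.write_table(table, "data.parquet")  # hypothetical output path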
Example #21
def test_to_disk_custom_typing_filename(sample_df, tmpdir, file_format):
    if file_format in ("arrow",
                       "feather") and not isinstance(sample_df, pd.DataFrame):
        pytest.xfail(
            "Arrow IPC format (Feather) not supported on Dask or Spark")

    sample_df.ww.init(index="id")
    error_msg = None
    if file_format == "orc" and _is_dask_dataframe(sample_df):
        error_msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        error_type = ValueError
    elif file_format == "pickle" and not isinstance(sample_df, pd.DataFrame):
        error_msg = "DataFrame type not compatible with pickle serialization. Please serialize to another format."
        error_type = ValueError

    custom_typing_filename = "custom_typing_info.json"
    if error_msg:
        with pytest.raises(error_type, match=error_msg):
            sample_df.ww.to_disk(
                str(tmpdir),
                format=file_format,
                typing_info_filename=custom_typing_filename,
            )
    else:
        sample_df.ww.to_disk(str(tmpdir),
                             format=file_format,
                             typing_info_filename=custom_typing_filename)
        assert os.path.isfile(os.path.join(tmpdir, custom_typing_filename))
        deserialized_df = read_woodwork_table(
            str(tmpdir), typing_info_filename=custom_typing_filename)
        pd.testing.assert_frame_equal(
            to_pandas(sample_df, index=sample_df.ww.index, sort_index=True),
            to_pandas(deserialized_df,
                      index=deserialized_df.ww.index,
                      sort_index=True),
        )
        assert sample_df.ww.schema == deserialized_df.ww.schema
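Example #22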
def _get_describe_dict(
    dataframe,
    include=None,
    callback=None,
    extra_stats=False,
    bins=10,
    top_x=10,
    recent_x=10,
):
    """Calculates statistics for data contained in a DataFrame using Woodwork typing information.

    Args:
        dataframe (pd.DataFrame): DataFrame to be described with Woodwork typing information initialized
        include (list[str or LogicalType], optional): filter for what columns to include in the
            statistics returned. Can be a list of column names, semantic tags, logical types, or a list
            combining any of the three. Matching follows the broadest specification, favoring
            logical types, then semantic tags, then column names. If no matching columns are
            found, an empty dictionary will be returned.
        callback (callable, optional): function to be called with incremental updates. Has the following parameters:

            - update (int): change in progress since last call
            - progress (int): the progress so far in the calculations
            - total (int): the total number of calculations to do
            - unit (str): unit of measurement for progress/total
            - time_elapsed (float): total time in seconds elapsed since start of call

        extra_stats (bool): If True, will calculate a histogram for numeric columns, top values
            for categorical columns and value counts for the most recent values in datetime columns. Will also
            calculate value counts within the range of values present for integer columns if the range of
            values present is less than or equal to the number of bins used to compute the histogram.
            Output can be controlled by bins, top_x and recent_x parameters.
        bins (int): Number of bins to use when calculating histogram for numeric columns. Defaults to 10.
            Will be ignored unless extra_stats=True.
        top_x (int): Number of items to return when getting the most frequently occurring values for categorical
            columns. Defaults to 10. Will be ignored unless extra_stats=True.
        recent_x (int): Number of values to return when calculating value counts for the most recent dates in
            datetime columns. Defaults to 10. Will be ignored unless extra_stats=True.

    Returns:
        dict[str -> dict]: A dictionary with a key for each column in the data or for each column
        matching the logical types, semantic tags, or column names specified in ``include``, paired
        with a dictionary of relevant statistics for that column.
    """
    start_time = timer()
    unit = "calculations"
    agg_stats_to_calculate = {
        "category": ["count", "nunique"],
        "numeric": ["count", "max", "min", "nunique", "mean", "std"],
        Datetime: ["count", "max", "min", "nunique", "mean"],
        Unknown: ["count", "nunique"],
    }
    if include is not None:
        filtered_cols = dataframe.ww._filter_cols(include, col_names=True)
        cols_to_include = [
            (k, v) for k, v in dataframe.ww.columns.items() if k in filtered_cols
        ]
    else:
        cols_to_include = dataframe.ww.columns.items()

    results = {}

    if _is_dask_dataframe(dataframe):
        df = dataframe.compute()
    elif _is_spark_dataframe(dataframe):
        df = dataframe.to_pandas()

        # Any LatLong columns will be using lists, which we must convert
        # back to tuples so we can calculate the mode, which requires hashable values
        latlong_columns = [
            col_name
            for col_name, col in dataframe.ww.columns.items()
            if type(col.logical_type) == LatLong
        ]
        df[latlong_columns] = df[latlong_columns].applymap(
            lambda latlong: tuple(latlong) if latlong else latlong
        )
    else:
        df = dataframe

    # Setup for progress callback and make initial call
    # Assume 1 unit for general preprocessing, plus the main loop over columns
    total_loops = 1 + len(cols_to_include)
    callback_caller = CallbackCaller(callback, unit, total_loops, start_time=start_time)
    callback_caller.update(1)

    for column_name, column in cols_to_include:
        if "index" in column.semantic_tags:
            callback_caller.update(1)
            continue
        values = {}
        logical_type = column.logical_type
        semantic_tags = column.semantic_tags
        series = df[column_name]

        # Calculate Aggregation Stats
        if column.is_categorical:
            agg_stats = agg_stats_to_calculate["category"]
        elif column.is_numeric:
            agg_stats = agg_stats_to_calculate["numeric"]
        elif column.is_datetime:
            agg_stats = agg_stats_to_calculate[Datetime]
        elif column.is_unknown:
            agg_stats = agg_stats_to_calculate[Unknown]
        else:
            agg_stats = ["count"]
        values = series.agg(agg_stats).to_dict()

        # Calculate other specific stats based on logical type or semantic tags
        if column.is_boolean:
            values["num_false"] = series.value_counts().get(False, 0)
            values["num_true"] = series.value_counts().get(True, 0)
        elif column.is_numeric:
            float_series = series.astype(
                "float64"
            )  # workaround for https://github.com/pandas-dev/pandas/issues/42626
            quant_values = float_series.quantile([0.25, 0.5, 0.75]).tolist()
            values["first_quartile"] = quant_values[0]
            values["second_quartile"] = quant_values[1]
            values["third_quartile"] = quant_values[2]

        mode = _get_mode(series)
        # The format of the mode should match its format in the DataFrame
        if _is_spark_dataframe(dataframe) and series.name in latlong_columns:
            mode = list(mode)

        if column.is_latlong:
            nan_count = series.apply(_is_latlong_nan).sum()
            count = len(series) - nan_count

            values["nan_count"] = nan_count
            values["count"] = count
        else:
            values["nan_count"] = series.isna().sum()

        values["mode"] = mode
        values["physical_type"] = series.dtype
        values["logical_type"] = logical_type
        values["semantic_tags"] = semantic_tags

        # Calculate extra detailed stats, if requested
        if extra_stats:
            if column.is_numeric:
                if pd.isnull(values["max"]) or pd.isnull(values["min"]):
                    values["histogram"] = []
                    values["top_values"] = []
                else:
                    values["histogram"] = _get_histogram_values(series, bins=bins)
                    _range = range(int(values["min"]), int(values["max"]) + 1)
                    # Calculate top numeric values if range of values present
                    # is less than or equal to the number of histogram bins and the series
                    # contains only integer values
                    range_len = int(values["max"]) + 1 - int(values["min"])
                    if range_len <= bins and (series % 1 == 0).all():
                        values["top_values"] = _get_numeric_value_counts_in_range(
                            series, _range
                        )
            elif column.is_categorical:
                values["top_values"] = _get_top_values_categorical(series, top_x)
            elif column.is_datetime:
                values["recent_values"] = _get_recent_value_counts(series, recent_x)

        results[column_name] = values
        callback_caller.update(1)
    return results
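A hedged sketch of the public accessor that wraps this helper; it assumes `df.ww.describe_dict` forwards the parameters shown above, and the sample data is made up.

import pandas as pd
import woodwork as ww  # noqa: F401  (registers the .ww accessor)

df = pd.DataFrame({"id": [0, 1, 2], "age": [25.0, 33.0, 57.0]})
df.ww.init(index="id")

# Index column is skipped, so only the "age" statistics are returned
stats = df.ww.describe_dict(extra_stats=True, bins=5)
print(stats["age"]["mean"], stats["age"]["histogram"])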
Example #23
def typing_info_to_dict(dataframe):
    """Creates the description for a Woodwork table, including typing information for each column
    and loading information.

    Args:
        dataframe (pd.DataFrame, dd.Dataframe, ks.DataFrame): DataFrame with Woodwork typing
            information initialized.

    Returns:
        dict: Dictionary containing Woodwork typing information
    """
    if _is_dask_dataframe(dataframe):
        # Need to determine the category info for Dask so it can be saved below
        category_cols = [
            colname for colname, col in dataframe.ww._schema.columns.items()
            if col.is_categorical
        ]
        dataframe = dataframe.ww.categorize(columns=category_cols)
    ordered_columns = dataframe.columns

    def _get_physical_type_dict(column):
        type_dict = {"type": str(column.dtype)}
        if str(column.dtype) == "category":
            type_dict["cat_values"] = column.dtype.categories.to_list()
            type_dict["cat_dtype"] = str(column.dtype.categories.dtype)
        return type_dict

    column_typing_info = [{
        "name": col_name,
        "ordinal": ordered_columns.get_loc(col_name),
        "use_standard_tags": col.use_standard_tags,
        "logical_type": {
            "parameters": _get_specified_ltype_params(col.logical_type),
            "type": str(_get_ltype_class(col.logical_type)),
        },
        "physical_type": _get_physical_type_dict(dataframe[col_name]),
        "semantic_tags": sorted(list(col.semantic_tags)),
        "description": col.description,
        "origin": col.origin,
        "metadata": col.metadata,
    } for col_name, col in dataframe.ww.columns.items()]

    if _is_dask_dataframe(dataframe):
        table_type = "dask"
    elif _is_spark_dataframe(dataframe):
        table_type = "spark"
    else:
        table_type = "pandas"

    return {
        "schema_version": SCHEMA_VERSION,
        "name": dataframe.ww.name,
        "index": dataframe.ww.index,
        "time_index": dataframe.ww.time_index,
        "column_typing_info": column_typing_info,
        "loading_info": {
            "table_type": table_type
        },
        "table_metadata": dataframe.ww.metadata,
    }
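The public entry point for this helper is `df.ww.to_dictionary()` (exercised in test_to_dictionary further below); a minimal sketch, dumped to JSON, with made-up data:

import json

import pandas as pd
import woodwork as ww  # noqa: F401  (registers the .ww accessor)

df = pd.DataFrame({"id": [0, 1], "age": [25, 33]})
df.ww.init(index="id")

typing_info = df.ww.to_dictionary()
print(json.dumps(typing_info, indent=2)[:200])  # preview of the typing info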
Example #24
def test_is_spark_dataframe(sample_df_spark):
    assert _is_spark_dataframe(sample_df_spark)
    assert not _is_dask_dataframe(pd.DataFrame())
Example #25
 def __init__(self, data):
     self.data = data
     if _is_dask_dataframe(data):
         raise TypeError("iloc is not supported for Dask DataFrames")
     elif _is_dask_series(data):
         raise TypeError("iloc is not supported for Dask Series")
Example #26
def _get_dependence_dict(
    dataframe,
    measures,
    num_bins=10,
    nrows=None,
    include_index=False,
    callback=None,
    extra_stats=False,
    min_shared=25,
    random_seed=0,
):
    """Calculates dependence measures between all pairs of columns in the DataFrame that
    support measuring dependence. Supports boolean, categorical, datetime, and numeric data.
    Call woodwork.utils.get_valid_mi_types and woodwork.utils.get_valid_pearson_types
    for complete lists of supported Logical Types.

    Args:
        dataframe (pd.DataFrame): Data containing Woodwork typing information
            from which to calculate dependence.
        measures (list or str): Which dependence measures to calculate.
            A list of measures can be provided to calculate multiple
            measures at once.  Valid measure strings:

                - "pearson": calculates the Pearson correlation coefficient
                - "mutual_info": calculates the mutual information between columns
                - "max":  max(abs(pearson), mutual) for each pair of columns
                - "all": includes columns for "pearson", "mutual_info", and "max"
        num_bins (int): Determines number of bins to use for converting numeric
            features into categorical.  Defaults to 10. Pearson calculation does
            not use binning.
        nrows (int): The number of rows to sample when determining dependence.
            If specified, samples the desired number of rows from the data.
            Defaults to using all rows.
        include_index (bool): If True, the column specified as the index will be
            included as long as its LogicalType is valid for measuring dependence.
            If False, the index column will not be considered. Defaults to False.
        callback (callable, optional): function to be called with incremental updates. Has the following parameters:

            - update (int): change in progress since last call
            - progress (int): the progress so far in the calculations
            - total (int): the total number of calculations to do
            - unit (str): unit of measurement for progress/total
            - time_elapsed (float): total time in seconds elapsed since start of call
        extra_stats (bool): If True, an additional "shared_rows" entry recording the number
            of shared non-null rows for each column pair will be included in the results. If
            the "max" measure is being used, a "measure_used" entry will be added that records
            whether Pearson or mutual information was the maximum dependence for a particular
            pair. Defaults to False.
        min_shared (int): The number of shared non-null rows needed to
            calculate.  Fewer rows than this will be considered too sparse
            to measure accurately and will return a NaN value. Must be
            non-negative. Defaults to 25.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    Returns:
        list(dict): A list containing dictionaries that have keys `column_1`,
        `column_2`, and keys for the specified dependence measures. The list is
        sorted in descending order by the first specified measure.
        Dependence values are between 0 (no dependence) and 1 (perfect
        dependence). For Pearson, values range from -1 to 1, but 0 still
        indicates no dependence.
    """
    start_time = timer()

    returned_measures, calc_order, calc_max = _parse_measures(measures)

    unit = "calculations"

    # get valid columns for dependence calculations
    if "pearson" in calc_order:
        pearson_types = get_valid_pearson_types()
        pearson_columns = _get_valid_columns(dataframe, pearson_types)
        valid_columns = pearson_columns
    if "mutual_info" in calc_order:
        mi_types = get_valid_mi_types()
        mutual_columns = _get_valid_columns(dataframe, mi_types)
        # pearson columns are a subset of mutual columns
        valid_columns = mutual_columns

    index = dataframe.ww.index
    if not include_index and index is not None and index in valid_columns:
        valid_columns.remove(index)

    data = dataframe.loc[:, valid_columns]
    # cut off data if necessary
    if _is_dask_dataframe(data):
        data = data.compute()
    elif _is_spark_dataframe(dataframe):
        data = data.to_pandas()
    if nrows is not None and nrows < data.shape[0]:
        data = data.sample(nrows, random_state=random_seed)

    notna_mask = data.notnull()
    not_null_cols = data.columns[notna_mask.any()]
    not_null_col_set = set(not_null_cols)
    if not_null_col_set != set(valid_columns):
        data = data.loc[:, not_null_cols]

    p = 0  # number of pearson columns
    m = 0  # number of mutual columns
    if "pearson" in calc_order:
        pearson_columns = [
            col for col in pearson_columns if col in not_null_col_set
        ]
        p = len(pearson_columns)
    if "mutual_info" in calc_order:
        mutual_columns = [
            col for col in mutual_columns if col in not_null_col_set
        ]
        m = len(mutual_columns)
    n = max(m, p)

    # number of pairwise combinations is n! / (2 * (n - 2)!), which reduces to n * (n - 1) / 2
    def _num_calc_steps(n):
        return (n * n - n) / 2

    # Assume 1 unit for preprocessing, n for handling nulls, m for binning numerics
    total_loops = 1 + n + m + _num_calc_steps(p) + _num_calc_steps(m)
    callback_caller = CallbackCaller(callback,
                                     unit,
                                     total_loops,
                                     start_time=start_time)
    callback_caller.update(1)

    # split dataframe into dict of series so we can drop nulls on a per-column basis
    data = {col: data[col].dropna() for col in data}

    # cast nullable type to non-nullable (needed for both pearson and mutual)
    _cast_nullable_int_and_datetime_to_int(data, dataframe.ww.columns)
    callback_caller.update(n)

    results = defaultdict(dict)

    for measure in calc_order:
        if measure == "mutual_info":
            _bin_numeric_cols_into_categories(dataframe.ww.schema, data,
                                              num_bins)
            callback_caller.update(n)
            col_names = mutual_columns
        elif measure == "pearson":
            col_names = pearson_columns

        _calculate_dependence_measure(
            measure=measure,
            data=data,
            results=results,
            callback_caller=callback_caller,
            notna_mask=notna_mask,
            min_shared=min_shared,
            col_names=col_names,
        )

    for result in results.values():
        if calc_max:
            _calculate_max_dependence_for_pair(
                result=result,
                min_shared=min_shared,
                extra_stats=extra_stats,
            )
            if returned_measures == ["max"]:
                # remove measurements not expected in returned dictionary
                del result["mutual_info"]
                if "pearson" in result:
                    del result["pearson"]

        # Remove cached info not expected in result by user
        if "num_union" in result:
            del result["num_union"]
        if not extra_stats:
            del result["shared_rows"]

    results = list(results.values())

    def sort_key(result):
        key = abs(result[returned_measures[0]])
        if np.isnan(key):
            key = -1
        return key

    results.sort(key=sort_key, reverse=True)

    return results
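A hedged sketch of the public accessor layer: recent Woodwork releases expose this helper through `df.ww.mutual_information()` / `df.ww.dependence()`. The call below assumes `mutual_information` is available; the toy data is made up.

import pandas as pd
import woodwork as ww  # noqa: F401  (registers the .ww accessor)

df = pd.DataFrame({
    "ints": [1, 2, 3, 4, 5, 6] * 5,
    "doubled": [2, 4, 6, 8, 10, 12] * 5,
})
df.ww.init()

# Expected to return one row per column pair with a mutual_info score
print(df.ww.mutual_information())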
Example #27
def test_to_dictionary(sample_df):
    if _is_dask_dataframe(sample_df):
        table_type = "dask"
        age_cat_type_dict = {
            "type": "category",
            "cat_values": [25, 33, 57],
            "cat_dtype": "int64",
        }
        cat_type_dict = {
            "type": "category",
            "cat_values": ["a", "b", "c"],
            "cat_dtype": "object",
        }
    elif _is_spark_dataframe(sample_df):
        table_type = "spark"
        age_cat_type_dict = {"type": "string"}
        cat_type_dict = {"type": "string"}
    else:
        table_type = "pandas"
        age_cat_type_dict = {
            "type": "category",
            "cat_values": [25, 33, 57],
            "cat_dtype": "int64",
        }
        cat_type_dict = {
            "type": "category",
            "cat_values": ["a", "b", "c"],
            "cat_dtype": "object",
        }

    int_val = "int64"
    nullable_int_val = "Int64"
    string_val = "string"
    bool_val = "boolean"
    double_val = "float64"

    expected = {
        "schema_version": SCHEMA_VERSION,
        "name": "test_data",
        "index": "id",
        "time_index": None,
        "column_typing_info": [
            {
                "name": "id",
                "ordinal": 0,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "Integer"
                },
                "physical_type": {
                    "type": int_val
                },
                "semantic_tags": ["index", "tag1"],
                "description": None,
                "origin": None,
                "metadata": {
                    "is_sorted": True
                },
            },
            {
                "name": "full_name",
                "ordinal": 1,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "Unknown"
                },
                "physical_type": {
                    "type": string_val
                },
                "semantic_tags": [],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "email",
                "ordinal": 2,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "EmailAddress"
                },
                "physical_type": {
                    "type": string_val
                },
                "semantic_tags": [],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "phone_number",
                "ordinal": 3,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "PhoneNumber"
                },
                "physical_type": {
                    "type": string_val
                },
                "semantic_tags": [],
                "description": None,
                "origin": "base",
                "metadata": {},
            },
            {
                "name": "age",
                "ordinal": 4,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {
                        "order": [25, 33, 57]
                    },
                    "type": "Ordinal",
                },
                "physical_type": age_cat_type_dict,
                "semantic_tags": ["category"],
                "description": "age of the user",
                "origin": "base",
                "metadata": {
                    "interesting_values": [33, 57]
                },
            },
            {
                "name": "signup_date",
                "ordinal": 5,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {
                        "datetime_format": None,
                        "timezone": None
                    },
                    "type": "Datetime",
                },
                "physical_type": {
                    "type": "datetime64[ns]"
                },
                "semantic_tags": [],
                "description": "original signup date",
                "origin": "engineered",
                "metadata": {},
            },
            {
                "name": "is_registered",
                "ordinal": 6,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "BooleanNullable"
                },
                "physical_type": {
                    "type": bool_val
                },
                "semantic_tags": [],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "double",
                "ordinal": 7,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "Double"
                },
                "physical_type": {
                    "type": double_val
                },
                "semantic_tags": ["numeric"],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "double_with_nan",
                "ordinal": 8,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "Double"
                },
                "physical_type": {
                    "type": double_val
                },
                "semantic_tags": ["numeric"],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "integer",
                "ordinal": 9,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "Integer"
                },
                "physical_type": {
                    "type": int_val
                },
                "semantic_tags": ["numeric"],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "nullable_integer",
                "ordinal": 10,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "IntegerNullable"
                },
                "physical_type": {
                    "type": nullable_int_val
                },
                "semantic_tags": ["numeric"],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "boolean",
                "ordinal": 11,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "Boolean"
                },
                "physical_type": {
                    "type": "bool"
                },
                "semantic_tags": [],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "categorical",
                "ordinal": 12,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "Categorical"
                },
                "physical_type": cat_type_dict,
                "semantic_tags": ["category"],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "datetime_with_NaT",
                "ordinal": 13,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {
                        "datetime_format": None,
                        "timezone": None
                    },
                    "type": "Datetime",
                },
                "physical_type": {
                    "type": "datetime64[ns]"
                },
                "semantic_tags": [],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "url",
                "ordinal": 14,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "URL"
                },
                "physical_type": {
                    "type": string_val
                },
                "semantic_tags": [],
                "description": None,
                "origin": None,
                "metadata": {},
            },
            {
                "name": "ip_address",
                "ordinal": 15,
                "use_standard_tags": True,
                "logical_type": {
                    "parameters": {},
                    "type": "IPAddress"
                },
                "physical_type": {
                    "type": string_val
                },
                "semantic_tags": [],
                "description": None,
                "origin": None,
                "metadata": {},
            },
        ],
        "loading_info": {
            "table_type": table_type
        },
        "table_metadata": {
            "date_created": "11/16/20"
        },
    }
    sample_df.ww.init(
        name="test_data",
        index="id",
        semantic_tags={"id": "tag1"},
        logical_types={"age": Ordinal(order=[25, 33, 57])},
        table_metadata={"date_created": "11/16/20"},
        column_descriptions={
            "signup_date": "original signup date",
            "age": "age of the user",
        },
        column_origins={
            "phone_number": "base",
            "age": "base",
            "signup_date": "engineered",
        },
        column_metadata={
            "id": {
                "is_sorted": True
            },
            "age": {
                "interesting_values": [33, 57]
            },
        },
    )

    description = sample_df.ww.to_dictionary()
    assert description == expected