コード例 #1
0
def _get_valid_mi_columns(dataframe, include_index=False):
    """Return the names of columns whose Logical Type supports mutual information.

    Args:
        dataframe (pd.DataFrame): Data containing Woodwork typing information
            from which to calculate mutual information.
        include_index (bool): If True, the column specified as the index will be
            included as long as its LogicalType is valid for mutual information
            calculations. If False, the index column will not have mutual
            information calculated for it. Defaults to False.

    Returns:
        list: A list of column names that have valid Logical Types that support
        mutual information.
    """
    valid_types = tuple(get_valid_mi_types())
    valid_columns = [
        col_name
        for col_name, col in dataframe.ww.columns.items()
        if isinstance(col.logical_type, valid_types)
    ]

    if not include_index:
        index = dataframe.ww.index
        # The index column is only present in valid_columns when its Logical
        # Type passed the filter above; list.remove raises ValueError for a
        # missing element, so check membership before removing.
        if index is not None and index in valid_columns:
            valid_columns.remove(index)

    return valid_columns
コード例 #2
0
ファイル: test_utils.py プロジェクト: kaidisn/woodwork
def test_get_valid_mi_types():
    """Check that get_valid_mi_types() compares equal to the expected type list."""
    actual = get_valid_mi_types()
    expected = [
        Boolean, Categorical, CountryCode,
        Datetime, Double, Integer,
        Ordinal, SubRegionCode, ZIPCode,
    ]

    assert actual == expected
コード例 #3
0
def test_get_valid_mi_types():
    """Check that get_valid_mi_types() compares equal to the expected type list."""
    actual = get_valid_mi_types()
    expected = [
        Boolean, BooleanNullable, Categorical,
        CountryCode, Datetime, Double,
        Integer, IntegerNullable, Ordinal,
        PostalCode, SubRegionCode,
    ]

    assert actual == expected
コード例 #4
0
def _get_dependence_dict(
    dataframe,
    measures,
    num_bins=10,
    nrows=None,
    include_index=False,
    callback=None,
    extra_stats=False,
    min_shared=25,
    random_seed=0,
):
    """Calculates dependence measures between all pairs of columns in the DataFrame that
    support measuring dependence. Supports boolean, categorical, datetime, and numeric data.
    Call woodwork.utils.get_valid_mi_types and woodwork.utils.get_valid_pearson_types
    for complete lists of supported Logical Types.

    Args:
        dataframe (pd.DataFrame): Data containing Woodwork typing information
            from which to calculate dependence.
        measures (list or str): Which dependence measures to calculate.
            A list of measures can be provided to calculate multiple
            measures at once.  Valid measure strings:

                - "pearson": calculates the Pearson correlation coefficient
                - "mutual_info": calculates the mutual information between columns
                - "max":  max(abs(pearson), mutual) for each pair of columns
                - "all": includes columns for "pearson", "mutual_info", and "max"
        num_bins (int): Determines number of bins to use for converting numeric
            features into categorical.  Defaults to 10. Pearson calculation does
            not use binning.
        nrows (int): The number of rows to sample when determining dependence.
            If specified, samples the desired number of rows from the data.
            Defaults to using all rows.
        include_index (bool): If True, the column specified as the index will be
            included as long as its LogicalType is valid for measuring dependence.
            If False, the index column will not be considered. Defaults to False.
        callback (callable, optional): function to be called with incremental updates. Has the following parameters:

            - update (int): change in progress since last call
            - progress (int): the progress so far in the calculations
            - total (int): the total number of calculations to do
            - unit (str): unit of measurement for progress/total
            - time_elapsed (float): total time in seconds elapsed since start of call
        extra_stats (bool):  If True, additional column "shared_rows"
            recording the number of shared non-null rows for a column
            pair will be included with the dataframe.  If the "max"
            measure is being used, a "measure_used" column will be added
            that records whether Pearson or mutual information was the
            maximum dependence for a particular row. Defaults to False.
        min_shared (int): The number of shared non-null rows needed to
            calculate.  Fewer rows than this will be considered too sparse
            to measure accurately and will return a NaN value. Must be
            non-negative. Defaults to 25.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        list(dict): A list containing dictionaries that have keys `column_1`,
        `column_2`, and keys for the specified dependence measures. The list is
        sorted in descending order by the absolute value of the first specified
        measure, with NaN values placed last.
        Dependence information values are between 0 (no dependence) and 1
        (perfect dependency). For Pearson, values range from -1 to 1 but 0 is
        still no dependence.
    """
    start_time = timer()

    returned_measures, calc_order, calc_max = _parse_measures(measures)

    unit = "calculations"

    # get valid columns for dependence calculations
    # NOTE(review): valid_columns is only bound if calc_order contains "pearson"
    # or "mutual_info"; presumably _parse_measures guarantees that - confirm.
    if "pearson" in calc_order:
        pearson_types = get_valid_pearson_types()
        pearson_columns = _get_valid_columns(dataframe, pearson_types)
        valid_columns = pearson_columns
    if "mutual_info" in calc_order:
        mi_types = get_valid_mi_types()
        mutual_columns = _get_valid_columns(dataframe, mi_types)
        # pearson columns are a subset of mutual columns
        valid_columns = mutual_columns

    index = dataframe.ww.index
    # valid_columns is an alias of pearson_columns or mutual_columns, so this
    # removal mutates that list in place. When both measures are requested only
    # mutual_columns is mutated here; pearson_columns is filtered further below
    # against the columns that remain in `data`.
    if not include_index and index is not None and index in valid_columns:
        valid_columns.remove(index)

    data = dataframe.loc[:, valid_columns]
    # cut off data if necessary
    if _is_dask_dataframe(data):
        data = data.compute()
    # note: the spark check inspects the original `dataframe`, not the slice
    elif _is_spark_dataframe(dataframe):
        data = data.to_pandas()
    if nrows is not None and nrows < data.shape[0]:
        data = data.sample(nrows, random_state=random_seed)

    # notna_mask is computed before any columns are dropped and is passed
    # unchanged to _calculate_dependence_measure below.
    notna_mask = data.notnull()
    # keep only columns with at least one non-null value
    not_null_cols = data.columns[notna_mask.any()]
    not_null_col_set = set(not_null_cols)
    if not_null_col_set != set(valid_columns):
        data = data.loc[:, not_null_cols]

    p = 0  # number of pearson columns
    m = 0  # number of mutual columns
    if "pearson" in calc_order:
        pearson_columns = [
            col for col in pearson_columns if col in not_null_col_set
        ]
        p = len(pearson_columns)
    if "mutual_info" in calc_order:
        mutual_columns = [
            col for col in mutual_columns if col in not_null_col_set
        ]
        m = len(mutual_columns)
    n = max(m, p)

    # combinations in a loop is n! / 2 / (n - 2)! which reduces to (n) (n - 1) / 2
    # NOTE(review): `/` is true division, so this returns a float and
    # total_loops below is a float even though the callback docs say int.
    def _num_calc_steps(n):
        return (n * n - n) / 2

    # Assume 1 unit for preprocessing, n for handling nulls, m for binning numerics
    total_loops = 1 + n + m + _num_calc_steps(p) + _num_calc_steps(m)
    callback_caller = CallbackCaller(callback,
                                     unit,
                                     total_loops,
                                     start_time=start_time)
    callback_caller.update(1)

    # split dataframe into dict of series so we can drop nulls on a per-column basis
    data = {col: data[col].dropna() for col in data}

    # cast nullable type to non-nullable (needed for both pearson and mutual)
    _cast_nullable_int_and_datetime_to_int(data, dataframe.ww.columns)
    callback_caller.update(n)

    # results is handed to _calculate_dependence_measure on every pass;
    # presumably it is filled in place, keyed per column pair - confirm.
    results = defaultdict(dict)

    for measure in calc_order:
        if measure == "mutual_info":
            _bin_numeric_cols_into_categories(dataframe.ww.schema, data,
                                              num_bins)
            # total_loops budgets m units for binning while n is reported here;
            # the two are equal whenever m >= p (n == max(m, p)).
            callback_caller.update(n)
            col_names = mutual_columns
        elif measure == "pearson":
            col_names = pearson_columns

        _calculate_dependence_measure(
            measure=measure,
            data=data,
            results=results,
            callback_caller=callback_caller,
            notna_mask=notna_mask,
            min_shared=min_shared,
            col_names=col_names,
        )

    for result in results.values():
        if calc_max:
            _calculate_max_dependence_for_pair(
                result=result,
                min_shared=min_shared,
                extra_stats=extra_stats,
            )
            if returned_measures == ["max"]:
                # remove measurements not expected in returned dictionary
                # ("mutual_info" is deleted unconditionally, "pearson" only
                # if present)
                del result["mutual_info"]
                if "pearson" in result:
                    del result["pearson"]

        # Remove cached info not expected in result by user
        if "num_union" in result:
            del result["num_union"]
        if not extra_stats:
            del result["shared_rows"]

    results = list(results.values())

    # Sort by magnitude of the first returned measure; NaN maps to -1 so that
    # it lands after every real value in the descending sort.
    def sort_key(result):
        key = abs(result[returned_measures[0]])
        if np.isnan(key):
            key = -1
        return key

    results.sort(key=sort_key, reverse=True)

    return results
コード例 #5
0
def _get_mutual_information_dict(dataframe, num_bins=10, nrows=None, include_index=False):
    """Calculates mutual information between all pairs of columns in the DataFrame that
    support mutual information. Logical Types that support mutual information are as
    follows:  Boolean, Categorical, CountryCode, Datetime, Double, Integer, Ordinal,
    PostalCode, and SubRegionCode

    Args:
        dataframe (pd.DataFrame): Data containing Woodwork typing information
            from which to calculate mutual information.
        num_bins (int): Determines number of bins to use for converting
            numeric features into categorical. Defaults to 10.
        nrows (int): The number of rows to sample when determining mutual info.
            If specified, samples the desired number of rows from the data.
            Defaults to using all rows.
        include_index (bool): If True, the column specified as the index will be
            included as long as its LogicalType is valid for mutual information calculations.
            If False, the index column will not have mutual information calculated for it.
            Defaults to False.

    Returns:
        list(dict): A list containing dictionaries that have keys `column_1`,
        `column_2`, and `mutual_info` that is sorted in descending order by mutual info.
        Mutual information values are between 0 (no mutual information) and 1
        (perfect dependency).
    """
    valid_types = get_valid_mi_types()
    valid_columns = [
        col_name
        for col_name, col in dataframe.ww.columns.items()
        if _get_ltype_class(col.logical_type) in valid_types
    ]

    if not include_index:
        index = dataframe.ww.index
        # The index column is only in valid_columns when its Logical Type is
        # valid for mutual information; list.remove raises ValueError for a
        # missing element, so check membership before removing.
        if index is not None and index in valid_columns:
            valid_columns.remove(index)

    data = dataframe.loc[:, valid_columns]
    if dd and isinstance(data, dd.DataFrame):
        data = data.compute()
    # note: this check inspects the original `dataframe`, not the slice
    if ks and isinstance(dataframe, ks.DataFrame):
        data = data.to_pandas()

    # cut off data if necessary
    if nrows is not None and nrows < data.shape[0]:
        data = data.sample(nrows)

    # remove fully null columns
    not_null_cols = data.columns[data.notnull().any()]
    if set(not_null_cols) != set(valid_columns):
        data = data.loc[:, not_null_cols]

    data = _replace_nans_for_mutual_info(dataframe.ww.schema, data)
    data = _make_categorical_for_mutual_info(dataframe.ww.schema, data, num_bins)

    # calculate mutual info for all unordered pairs of distinct columns; a
    # column is never paired with itself because that score is always 1
    mutual_info = []
    col_names = data.columns.to_list()
    for i, a_col in enumerate(col_names):
        for b_col in col_names[i + 1:]:
            mi_score = normalized_mutual_info_score(data[a_col], data[b_col])
            mutual_info.append(
                {"column_1": a_col, "column_2": b_col, "mutual_info": mi_score}
            )
    mutual_info.sort(key=lambda mi: mi["mutual_info"], reverse=True)
    return mutual_info