Example #1
    def _update_dtype(self):
        """Update the dtype of the underlying series to match the dtype corresponding
        to the LogicalType for the column."""
        if isinstance(self.logical_type, Ordinal):
            self.logical_type._validate_data(self._series)
        elif _get_ltype_class(self.logical_type) == LatLong:
            # Reformat LatLong columns to be a length two tuple (or list for Koalas) of floats
            if dd and isinstance(self._series, dd.Series):
                name = self._series.name
                meta = (self._series, tuple([float, float]))
                self._series = self._series.apply(_reformat_to_latlong,
                                                  meta=meta)
                self._series.name = name
            elif ks and isinstance(self._series, ks.Series):
                formatted_series = self._series.to_pandas().apply(
                    _reformat_to_latlong, use_list=True)
                self._series = ks.from_pandas(formatted_series)
            else:
                self._series = self._series.apply(_reformat_to_latlong)

        if self.logical_type.pandas_dtype != str(self._series.dtype):
            # Update the underlying series
            try:
                if _get_ltype_class(self.logical_type) == Datetime:
                    if dd and isinstance(self._series, dd.Series):
                        name = self._series.name
                        self._series = dd.to_datetime(
                            self._series,
                            format=self.logical_type.datetime_format)
                        self._series.name = name
                    elif ks and isinstance(self._series, ks.Series):
                        self._series = ks.Series(ks.to_datetime(
                            self._series.to_numpy(),
                            format=self.logical_type.datetime_format),
                                                 name=self._series.name)
                    else:
                        self._series = pd.to_datetime(
                            self._series,
                            format=self.logical_type.datetime_format)
                else:
                    if ks and isinstance(
                            self._series,
                            ks.Series) and self.logical_type.backup_dtype:
                        new_dtype = self.logical_type.backup_dtype
                    else:
                        new_dtype = self.logical_type.pandas_dtype
                    self._series = self._series.astype(new_dtype)
            except (TypeError, ValueError):
                error_msg = f'Error converting datatype for column {self.name} from type {str(self._series.dtype)} ' \
                    f'to type {self.logical_type.pandas_dtype}. Please confirm the underlying data is consistent with ' \
                    f'logical type {self.logical_type}.'
                raise TypeError(error_msg)
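The core of this method is mapping a LogicalType's expected dtype onto the series. A minimal pandas-only sketch of that conversion pattern, using a hypothetical `FakeDatetime` stand-in rather than Woodwork's real Datetime type:

import pandas as pd

class FakeDatetime:
    # Hypothetical stand-in for a Woodwork Datetime logical type
    pandas_dtype = 'datetime64[ns]'
    datetime_format = '%Y-%m-%d'

ltype = FakeDatetime()
series = pd.Series(['2020-01-01', '2020-02-01'])
if ltype.pandas_dtype != str(series.dtype):
    try:
        series = pd.to_datetime(series, format=ltype.datetime_format)
    except (TypeError, ValueError):
        raise TypeError('underlying data is inconsistent with the logical type')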
Example #2
def _replace_nans_for_mutual_info(schema, data):
    """Replace NaN values in the dataframe so that mutual information can be calculated

    Args:
        schema (woodwork.TableSchema): Woodwork typing info for the data
        data (pd.DataFrame): dataframe to use for calculating mutual information

    Returns:
        pd.DataFrame: data with nans replaced with either mean or mode

    """
    for column_name in data.columns[data.isnull().any()]:
        column = schema.columns[column_name]
        series = data[column_name]

        if column.is_numeric or column.is_datetime:
            mean = series.mean()
            if isinstance(mean, float) and not _get_ltype_class(
                    column.logical_type) == Double:
                series = series.astype('float')
            data[column_name] = series.fillna(mean)
        elif column.is_categorical or column.is_boolean:
            mode = _get_mode(series)
            data[column_name] = series.fillna(mode)
    return data
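A self-contained illustration of the replacement rule on a toy frame, using only pandas: numeric columns get their mean, categorical columns get their mode.

import pandas as pd

data = pd.DataFrame({
    'age': [25.0, None, 31.0],
    'color': pd.Series(['red', None, 'red'], dtype='category'),
})
data['age'] = data['age'].fillna(data['age'].mean())            # numeric -> mean (28.0)
data['color'] = data['color'].fillna(data['color'].mode()[0])   # categorical -> mode ('red')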
Example #3
def _validate_logical_type(logical_type):
    ltype_class = _get_ltype_class(logical_type)

    if ltype_class not in ww.type_system.registered_types:
        raise TypeError(
            f'logical_type {logical_type} is not a registered LogicalType.')
    if ltype_class == Ordinal and not isinstance(logical_type, Ordinal):
        raise TypeError(
            "Must use an Ordinal instance with order values defined")
Example #4
def write_table_data(datatable, path, format='csv', **kwargs):
    '''Write underlying datatable data to disk or S3 path.

    Args:
        datatable (DataTable) : Instance of :class:`.DataTable`.
        path (str) : Location on disk to write datatable data.
        format (str) : Format to use for writing datatable data. Defaults to csv.
        kwargs (keywords) : Additional keyword arguments to pass to the underlying serialization method.

    Returns:
        loading_info (dict) : Information on storage location and format of datatable data.
    '''
    format = format.lower()

    dt_name = datatable.name or 'data'
    df = datatable.to_dataframe()

    if dd and isinstance(df, dd.DataFrame) and format == 'csv':
        basename = "{}-*.{}".format(dt_name, format)
    else:
        basename = '.'.join([dt_name, format])
    location = os.path.join('data', basename)
    file = os.path.join(path, location)

    if format == 'csv':
        compression = kwargs['compression']
        if ks and isinstance(df, ks.DataFrame):
            df = df.copy()
            columns = list(df.select_dtypes('object').columns)
            df[columns] = df[columns].astype(str)
            compression = str(compression)
        df.to_csv(file,
                  index=kwargs['index'],
                  sep=kwargs['sep'],
                  encoding=kwargs['encoding'],
                  compression=compression)
    elif format == 'pickle':
        # Dask and Koalas currently do not support to_pickle
        if not isinstance(df, pd.DataFrame):
            msg = 'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
            raise ValueError(msg)
        df.to_pickle(file, **kwargs)
    elif format == 'parquet':
        # Latlong columns in pandas and Dask DataFrames contain tuples, which raises
        # an error in parquet format.
        df = df.copy()
        latlong_columns = [
            col_name for col_name, col in datatable.columns.items()
            if _get_ltype_class(col.logical_type) == ww.logical_types.LatLong
        ]
        df[latlong_columns] = df[latlong_columns].astype(str)

        df.to_parquet(file, **kwargs)
    else:
        error = 'must be one of the following formats: {}'
        raise ValueError(error.format(', '.join(FORMATS)))
    return {'location': location, 'type': format, 'params': kwargs}
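A hypothetical invocation: note that the csv branch reads `index`, `sep`, `encoding`, and `compression` directly out of kwargs, so those keys must be supplied. The `dt` object is an assumed, already-constructed DataTable.

info = write_table_data(dt, '/tmp/out', format='csv',
                        index=False, sep=',', encoding='utf-8', compression=None)
# info -> {'location': 'data/data.csv', 'type': 'csv', 'params': {...}} when dt.name is None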
Example #5
def typing_info_to_dict(dataframe):
    """Creates the description for a Woodwork table, including typing information for each column
    and loading information.

    Args:
        dataframe (pd.DataFrame, dd.Dataframe, ks.DataFrame): DataFrame with Woodwork typing
            information initialized.

    Returns:
        dict: Dictionary containing Woodwork typing information
    """
    if dd and isinstance(dataframe, dd.DataFrame):
        # Need to determine the category info for Dask so it can be saved below
        category_cols = [colname for colname, col in dataframe.ww._schema.columns.items() if col.is_categorical]
        dataframe = dataframe.ww.categorize(columns=category_cols)
    ordered_columns = dataframe.columns
    column_typing_info = [
        {'name': col_name,
         'ordinal': ordered_columns.get_loc(col_name),
         'use_standard_tags': col.use_standard_tags,
         'logical_type': {
             'parameters': _get_specified_ltype_params(col.logical_type),
             'type': str(_get_ltype_class(col.logical_type))
         },
         'physical_type': {
             'type': str(dataframe[col_name].dtype),
             # Store categorical values so they can be recreated if they are modified during serialization
             'cat_values': dataframe[col_name].dtype.categories.to_list() if str(dataframe[col_name].dtype) == 'category' else None
         },
         'semantic_tags': sorted(list(col.semantic_tags)),
         'description': col.description,
         'metadata': col.metadata,
         }
        for col_name, col in dataframe.ww.columns.items()
    ]

    if dd and isinstance(dataframe, dd.DataFrame):
        table_type = 'dask'
    elif ks and isinstance(dataframe, ks.DataFrame):
        table_type = 'koalas'
    else:
        table_type = 'pandas'

    return {
        'schema_version': SCHEMA_VERSION,
        'name': dataframe.ww.name,
        'index': dataframe.ww.index,
        'time_index': dataframe.ww.time_index,
        'column_typing_info': column_typing_info,
        'loading_info': {
            'table_type': table_type
        },
        'table_metadata': dataframe.ww.metadata
    }
Example #6
def _check_time_index(column_names, time_index, logical_type):
    if time_index not in column_names:
        raise ColumnNotPresentError(
            f"Specified time index column `{time_index}` not found in TableSchema"
        )
    ltype_class = _get_ltype_class(logical_type)

    if not (ltype_class == ww.logical_types.Datetime
            or "numeric" in ltype_class.standard_tags):
        raise TypeError(
            "Time index column must be a Datetime or numeric column.")
Example #7
def _check_time_index(column_names, time_index, logical_type):
    if time_index not in column_names:
        raise LookupError(
            f'Specified time index column `{time_index}` not found in TableSchema'
        )
    ltype_class = _get_ltype_class(logical_type)

    if not (ltype_class == ww.logical_types.Datetime
            or 'numeric' in ltype_class.standard_tags):
        raise TypeError(
            'Time index column must be a Datetime or numeric column.')
Example #8
def clean_latlong(dataframe):
    """Convert latlong tuples to strings for parquet, arrow and feather file format.
    Attempting to serialize with tuples present results in an error"""
    latlong_columns = [
        col_name for col_name, col in dataframe.ww.columns.items()
        if _get_ltype_class(col.logical_type) == LatLong
    ]
    if len(latlong_columns) > 0:
        dataframe = dataframe.ww.copy()
        dataframe[latlong_columns] = dataframe[latlong_columns].astype(str)

    return dataframe
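The underlying conversion is just an astype on the affected columns; a minimal pandas-only sketch:

import pandas as pd

df = pd.DataFrame({'location': [(42.3, -71.1), (37.8, -122.4)]})
df['location'] = df['location'].astype(str)   # tuples -> '(42.3, -71.1)', safe for parquet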
Example #9
    def _parse_logical_type(self, logical_type):
        if logical_type:
            if isinstance(logical_type, str):
                logical_type = ww.type_system.str_to_logical_type(logical_type)
            ltype_class = _get_ltype_class(logical_type)
            if ltype_class == Ordinal and not isinstance(logical_type, Ordinal):
                raise TypeError(
                    "Must use an Ordinal instance with order values defined")
            if ltype_class in ww.type_system.registered_types:
                return logical_type
            else:
                raise TypeError(
                    f"Invalid logical type specified for '{self.name}'")
        else:
            return ww.type_system.infer_logical_type(self._series)
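Strings are resolved through the type system before validation; a brief sketch, assuming Woodwork is installed:

import woodwork as ww

ltype = ww.type_system.str_to_logical_type('Integer')   # resolves the string to a registered LogicalType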
Example #10
def typing_info_to_dict(dataframe):
    """Creates the description for a Woodwork table, including typing information for each column
    and loading information.

    Args:
        dataframe (pd.DataFrame, dd.Dataframe, ks.DataFrame): DataFrame with Woodwork typing
            information initialized.

    Returns:
        dict: Dictionary containing Woodwork typing information
    """
    ordered_columns = dataframe.columns
    column_typing_info = [{
        'name': col_name,
        'ordinal': ordered_columns.get_loc(col_name),
        'use_standard_tags': col.use_standard_tags,
        'logical_type': {
            'parameters': _get_specified_ltype_params(col.logical_type),
            'type': str(_get_ltype_class(col.logical_type))
        },
        'physical_type': {
            'type': str(dataframe[col_name].dtype)
        },
        'semantic_tags': sorted(list(col.semantic_tags)),
        'description': col.description,
        'metadata': col.metadata,
    } for col_name, col in dataframe.ww.columns.items()]

    if dd and isinstance(dataframe, dd.DataFrame):
        table_type = 'dask'
    elif ks and isinstance(dataframe, ks.DataFrame):
        table_type = 'koalas'
    else:
        table_type = 'pandas'

    return {
        'schema_version': SCHEMA_VERSION,
        'name': dataframe.ww.name,
        'index': dataframe.ww.index,
        'time_index': dataframe.ww.time_index,
        'column_typing_info': column_typing_info,
        'loading_info': {
            'table_type': table_type
        },
        'table_metadata': dataframe.ww.metadata
    }
Example #11
def datatable_to_description(datatable):
    '''Gets the description for a DataTable, including typing information for each column
    and loading information.
    '''
    df = datatable.to_dataframe()
    ordered_columns = df.columns
    column_metadata = [{
        'name': col.name,
        'ordinal': ordered_columns.get_loc(col.name),
        'logical_type': {
            'parameters': _get_specified_ltype_params(col.logical_type),
            'type': str(_get_ltype_class(col.logical_type))
        },
        'physical_type': {
            'type': str(col.dtype)
        },
        'semantic_tags': sorted(list(col.semantic_tags)),
        'description': col.description,
        'metadata': col.metadata
    } for col in datatable.columns.values()]

    if dd and isinstance(df, dd.DataFrame):
        table_type = 'dask'
    elif ks and isinstance(df, ks.DataFrame):
        table_type = 'koalas'
    else:
        table_type = 'pandas'

    return {
        'schema_version': SCHEMA_VERSION,
        'name': datatable.name,
        'index': datatable.index,
        'time_index': datatable.time_index,
        'column_metadata': column_metadata,
        'loading_info': {
            'table_type': table_type
        },
        'table_metadata': datatable.metadata
    }
Example #12
def _check_logical_types(column_names, logical_types, require_all_cols=True):
    if not isinstance(logical_types, dict):
        raise TypeError("logical_types must be a dictionary")
    cols_in_ltypes = set(logical_types.keys())
    cols_in_schema = set(column_names)

    cols_not_found_in_schema = cols_in_ltypes.difference(cols_in_schema)
    if cols_not_found_in_schema:
        raise ColumnNotPresentError(
            "logical_types contains columns that are not present in "
            f"TableSchema: {sorted(list(cols_not_found_in_schema))}")
    cols_not_found_in_ltypes = cols_in_schema.difference(cols_in_ltypes)
    if cols_not_found_in_ltypes and require_all_cols:
        raise ColumnNotPresentError(
            f"logical_types is missing columns that are present in "
            f"TableSchema: {sorted(list(cols_not_found_in_ltypes))}")

    for col_name, logical_type in logical_types.items():
        if _get_ltype_class(
                logical_type) not in ww.type_system.registered_types:
            raise TypeError("Logical Types must be of the LogicalType class "
                            "and registered in Woodwork's type system. "
                            f"{logical_type} does not meet that criteria.")
Example #13
def _validate_accessor_params(dataframe, index, make_index, time_index,
                              logical_types, schema, use_standard_tags):
    _check_unique_column_names(dataframe)
    _check_use_standard_tags(use_standard_tags)
    if schema is not None:
        _check_schema(dataframe, schema)
    else:
        # We ignore these parameters if a schema is passed
        if index is not None or make_index:
            _check_index(dataframe, index, make_index)
        if logical_types:
            _check_logical_types(dataframe.columns, logical_types)
        if time_index is not None:
            datetime_format = None
            logical_type = None
            if logical_types is not None and time_index in logical_types:
                logical_type = logical_types[time_index]
                if _get_ltype_class(logical_types[time_index]) == Datetime:
                    datetime_format = logical_types[time_index].datetime_format

            _check_time_index(dataframe,
                              time_index,
                              datetime_format=datetime_format,
                              logical_type=logical_type)
Example #14
    def is_boolean(self):
        """Whether the ColumnSchema is a Boolean column"""
        ltype_class = _get_ltype_class(self.logical_type)
        return ltype_class == Boolean or ltype_class == BooleanNullable
Example #15
    def _filter_cols(self, include=None, exclude=None, col_names=False):
        """Return list of columns filtered with any of: semantic tags, LogicalTypes, column names

        Args:
            include (str or LogicalType or list[str or LogicalType]): parameter or list of parameters to
                filter columns by. Can be Logical Types or Semantic Tags. Columns that match will be
                included in the returned list of columns.
            exclude (str or LogicalType or list[str or LogicalType]): parameter or list of parameters to
                filter columns by. Can be Logical Types or Semantic Tags. Columns that match will be
                excluded from the returned list of columns.

            col_names (bool): Specifies whether to filter columns by name. Defaults to False.

        Returns:
            List[str] of column names that fit into filter.
        """
        assert not (include
                    and exclude), "Cannot specify both include and exclude"
        if include and not isinstance(include, list):
            include = [include]
        elif exclude and not isinstance(exclude, list):
            exclude = [exclude]

        if include is not None:
            selectors = include
        elif exclude is not None:
            selectors = exclude

        ltypes_used = set()
        ltypes_in_schema = {
            type(col.logical_type)
            for col in self.columns.values()
        }

        tags_used = set()
        tags_in_schema = {
            tag
            for col in self.columns.values() for tag in col.semantic_tags
        }

        col_name_matches = set()

        for selector in selectors:
            # Determine if the selector is a registered, uninstantiated LogicalType
            maybe_ltype = selector
            if isinstance(selector, str):
                # Convert possible string to LogicalType - unregistered LogicalTypes return None
                maybe_ltype = ww.type_system.str_to_logical_type(
                    selector, raise_error=False)
            # Get the class - unregistered LogicalTypes return LogicalTypeMetaClass
            maybe_ltype_class = _get_ltype_class(maybe_ltype)

            if maybe_ltype_class in ww.type_system.registered_types:
                if maybe_ltype not in ww.type_system.registered_types:
                    raise TypeError(
                        f"Invalid selector used in include: {maybe_ltype} cannot be instantiated"
                    )
                if maybe_ltype in ltypes_in_schema:
                    ltypes_used.add(maybe_ltype)
            elif maybe_ltype_class == ww.logical_types.LogicalType.__class__:
                raise TypeError(
                    f"Specified LogicalType selector {maybe_ltype} is not registered in Woodwork's type system."
                )

            # Hashability as a proxy for whether a selector is possibly a semantic tag or column name
            if not isinstance(selector, Hashable):
                raise TypeError(
                    f"Invalid selector used in include: {selector} must be a "
                    "string, uninstantiated and registered LogicalType, or valid column name"
                )
            # Determine if the selector is a semantic tag
            if selector in tags_in_schema:
                tags_used.add(selector)
            # Determine if the selector is a column name
            if col_names and selector in self.columns:
                col_name_matches.add(selector)

        cols_to_return = []
        cols_seen = set()
        for col_name, col in self.columns.items():
            is_match = (type(col.logical_type) in ltypes_used
                        or col.semantic_tags.intersection(tags_used)
                        or col_name in col_name_matches)
            if include is not None and is_match and col_name not in cols_seen:
                cols_to_return.append(col_name)
                cols_seen.add(col_name)
            elif exclude is not None and not is_match and col_name not in cols_seen:
                cols_to_return.append(col_name)
                cols_seen.add(col_name)

        return cols_to_return
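A hedged usage sketch on a Woodwork-initialized DataFrame (the `df` name and its columns are assumed); note the assert above means include and exclude require separate calls:

numeric_cols = df.ww._filter_cols(include='numeric')
non_datetime = df.ww._filter_cols(exclude='Datetime')
by_name = df.ww._filter_cols(include=['age', 'grade'], col_names=True)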
Example #16
def _is_col_boolean(col):
    return _get_ltype_class(col.logical_type) == Boolean
Example #17
def _is_col_datetime(col):
    return _get_ltype_class(col.logical_type) == Datetime
Example #18
def _get_describe_dict(dataframe, include=None):
    """Calculates statistics for data contained in a DataFrame using Woodwork typing information.

    Args:
        dataframe (pd.DataFrame): DataFrame to be described with Woodwork typing information initialized
        include (list[str or LogicalType], optional): filter for which columns to include in the
            statistics returned. Can be a list of column names, semantic tags, logical types, or a
            list combining any of the three. Matching follows the broadest specification, favoring
            logical types, then semantic tags, then column names. If no matching columns are found,
            an empty DataFrame will be returned.

    Returns:
        dict[str -> dict]: A dictionary with a key for each column in the data or for each column
        matching the logical types, semantic tags or column names specified in ``include``, paired
        with a value containing a dictionary containing relevant statistics for that column.
    """
    agg_stats_to_calculate = {
        'category': ["count", "nunique"],
        'numeric': ["count", "max", "min", "nunique", "mean", "std"],
        Datetime: ["count", "max", "min", "nunique", "mean"],
    }
    if include is not None:
        filtered_cols = dataframe.ww._filter_cols(include, col_names=True)
        cols_to_include = [(k, v) for k, v in dataframe.ww.columns.items() if k in filtered_cols]
    else:
        cols_to_include = dataframe.ww.columns.items()

    results = {}

    if dd and isinstance(dataframe, dd.DataFrame):
        df = dataframe.compute()
    elif ks and isinstance(dataframe, ks.DataFrame):
        df = dataframe.to_pandas()

        # Any LatLong columns will be using lists, which we must convert
        # back to tuples so we can calculate the mode, which requires hashable values
        latlong_columns = [col_name for col_name, col in dataframe.ww.columns.items() if _get_ltype_class(col.logical_type) == LatLong]
        df[latlong_columns] = df[latlong_columns].applymap(lambda latlong: tuple(latlong) if latlong else latlong)
    else:
        df = dataframe

    for column_name, column in cols_to_include:
        if 'index' in column.semantic_tags:
            continue
        values = {}
        logical_type = column.logical_type
        semantic_tags = column.semantic_tags
        series = df[column_name]

        # Calculate Aggregation Stats
        if _is_col_categorical(column):
            agg_stats = agg_stats_to_calculate['category']
        elif _is_col_numeric(column):
            agg_stats = agg_stats_to_calculate['numeric']
        elif _is_col_datetime(column):
            agg_stats = agg_stats_to_calculate[Datetime]
        else:
            agg_stats = ["count"]
        values = series.agg(agg_stats).to_dict()

        # Calculate other specific stats based on logical type or semantic tags
        if _is_col_boolean(column):
            values["num_false"] = series.value_counts().get(False, 0)
            values["num_true"] = series.value_counts().get(True, 0)
        elif _is_col_numeric(column):
            quant_values = series.quantile([0.25, 0.5, 0.75]).tolist()
            values["first_quartile"] = quant_values[0]
            values["second_quartile"] = quant_values[1]
            values["third_quartile"] = quant_values[2]

        mode = _get_mode(series)
        # The format of the mode should match its format in the DataFrame
        if ks and isinstance(dataframe, ks.DataFrame) and series.name in latlong_columns:
            mode = list(mode)

        values["nan_count"] = series.isna().sum()
        values["mode"] = mode
        values["physical_type"] = series.dtype
        values["logical_type"] = logical_type
        values["semantic_tags"] = semantic_tags
        results[column_name] = values
    return results
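A small sketch of the include filter, assuming the function above is importable:

import pandas as pd
import woodwork as ww

df = pd.DataFrame({'age': [25, 31, 40], 'name': ['a', 'b', 'c']})
df.ww.init()
stats = _get_describe_dict(df, include=['numeric'])   # 'age' matches the numeric tag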
Example #19
def typing_info_to_dict(dataframe):
    """Creates the description for a Woodwork table, including typing information for each column
    and loading information.

    Args:
        dataframe (pd.DataFrame, dd.Dataframe, ks.DataFrame): DataFrame with Woodwork typing
            information initialized.

    Returns:
        dict: Dictionary containing Woodwork typing information
    """
    if _is_dask_dataframe(dataframe):
        # Need to determine the category info for Dask so it can be saved below
        category_cols = [
            colname for colname, col in dataframe.ww._schema.columns.items()
            if col.is_categorical
        ]
        dataframe = dataframe.ww.categorize(columns=category_cols)
    ordered_columns = dataframe.columns

    def _get_physical_type_dict(column):
        type_dict = {"type": str(column.dtype)}
        if str(column.dtype) == "category":
            type_dict["cat_values"] = column.dtype.categories.to_list()
            type_dict["cat_dtype"] = str(column.dtype.categories.dtype)
        return type_dict

    column_typing_info = [{
        "name": col_name,
        "ordinal": ordered_columns.get_loc(col_name),
        "use_standard_tags": col.use_standard_tags,
        "logical_type": {
            "parameters": _get_specified_ltype_params(col.logical_type),
            "type": str(_get_ltype_class(col.logical_type)),
        },
        "physical_type": _get_physical_type_dict(dataframe[col_name]),
        "semantic_tags": sorted(list(col.semantic_tags)),
        "description": col.description,
        "origin": col.origin,
        "metadata": col.metadata,
    } for col_name, col in dataframe.ww.columns.items()]

    if _is_dask_dataframe(dataframe):
        table_type = "dask"
    elif _is_spark_dataframe(dataframe):
        table_type = "spark"
    else:
        table_type = "pandas"

    return {
        "schema_version": SCHEMA_VERSION,
        "name": dataframe.ww.name,
        "index": dataframe.ww.index,
        "time_index": dataframe.ww.time_index,
        "column_typing_info": column_typing_info,
        "loading_info": {
            "table_type": table_type
        },
        "table_metadata": dataframe.ww.metadata,
    }
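A hedged end-to-end sketch, assuming the function above is importable:

import pandas as pd
import woodwork as ww

df = pd.DataFrame({'id': [0, 1, 2], 'grade': ['a', 'b', 'a']})
df.ww.init(index='id', logical_types={'grade': 'Categorical'})
description = typing_info_to_dict(df)
# description['loading_info']['table_type'] == 'pandas'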
Example #20
    def is_datetime(self):
        """Whether the ColumnSchema is a Datetime column"""
        return _get_ltype_class(self.logical_type) == Datetime
Example #21
def _get_mutual_information_dict(dataframe, num_bins=10, nrows=None, include_index=False):
    """Calculates mutual information between all pairs of columns in the DataFrame that
    support mutual information. Logical Types that support mutual information are as
    follows:  Boolean, Categorical, CountryCode, Datetime, Double, Integer, Ordinal,
    PostalCode, and SubRegionCode

    Args:
        dataframe (pd.DataFrame): Data containing Woodwork typing information
            from which to calculate mutual information.
        num_bins (int): Determines number of bins to use for converting
            numeric features into categorical.
        nrows (int): The number of rows to sample for when determining mutual info.
            If specified, samples the desired number of rows from the data.
            Defaults to using all rows.
        include_index (bool): If True, the column specified as the index will be
            included as long as its LogicalType is valid for mutual information calculations.
            If False, the index column will not have mutual information calculated for it.
            Defaults to False.

    Returns:
        list(dict): A list containing dictionaries that have keys `column_1`,
        `column_2`, and `mutual_info` that is sorted in descending order by mutual info.
        Mutual information values are between 0 (no mutual information) and 1
        (perfect dependency).
    """
    valid_types = get_valid_mi_types()
    valid_columns = [col_name for col_name, col in dataframe.ww.columns.items() if _get_ltype_class(col.logical_type) in valid_types]

    if not include_index and dataframe.ww.index is not None:
        valid_columns.remove(dataframe.ww.index)

    data = dataframe.loc[:, valid_columns]
    if dd and isinstance(data, dd.DataFrame):
        data = data.compute()
    if ks and isinstance(dataframe, ks.DataFrame):
        data = data.to_pandas()

    # cut off data if necessary
    if nrows is not None and nrows < data.shape[0]:
        data = data.sample(nrows)

    # remove fully null columns
    not_null_cols = data.columns[data.notnull().any()]
    if set(not_null_cols) != set(valid_columns):
        data = data.loc[:, not_null_cols]

    data = _replace_nans_for_mutual_info(dataframe.ww.schema, data)
    data = _make_categorical_for_mutual_info(dataframe.ww.schema, data, num_bins)

    # calculate mutual info for all pairs of columns
    mutual_info = []
    col_names = data.columns.to_list()
    for i, a_col in enumerate(col_names):
        for j in range(i, len(col_names)):
            b_col = col_names[j]
            if a_col == b_col:
                # Ignore because the mutual info for a column with itself will always be 1
                continue
            else:
                mi_score = normalized_mutual_info_score(data[a_col], data[b_col])
                mutual_info.append(
                    {"column_1": a_col, "column_2": b_col, "mutual_info": mi_score}
                )
    mutual_info.sort(key=lambda mi: mi['mutual_info'], reverse=True)
    return mutual_info
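A hedged usage sketch, assuming the function above is importable; the toy columns 'a' and 'b' are perfectly dependent, so their pair should sort first:

import pandas as pd
import woodwork as ww

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1, 2, 3, 4], 'c': ['x', 'x', 'y', 'y']})
df.ww.init()
mi = _get_mutual_information_dict(df, num_bins=2)
# each entry: {'column_1': ..., 'column_2': ..., 'mutual_info': ...}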