Code Example #1
    def check_mostly_same(
        data: Union[pd.DataFrame, pd.Series],
        thresh: float = 0.95,
    ) -> pd.DataFrame:
        """Checks if binary data contains almost all the same value.

        Args:
            data: Binary data to be checked if almost all values are the same.
            thresh: Threshold for what proportion of data must be the same to fail check.

        Returns:
            DataFrame with bool(s) indicating if data contains mostly the same value, the
            value of threshold used to determine if mostly same, and the average value(s).

        Raises:
            ValueError: If `thresh` is less than or equal to 0.0 or greater than or equal to 1.0.
        """
        _utils.validate_thresh(thresh)
        is_df = _utils.check_if_df(data)
        BinaryFeatures._validate_binary_dtype(data)
        if is_df:
            mean = data.mean(axis=0)
            result = (mean >= thresh) | (mean <= 1 - thresh)
        else:
            mean = data.mean()
            result = mean >= thresh or mean <= 1 - thresh
        return _utils.result_to_df(data=result,
                                   title='mostly_same',
                                   thresh=thresh,
                                   mean=mean)
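
A quick standalone sketch of the check's core logic (plain pandas only; the toy data and threshold below are made up, and the `_utils` helpers and surrounding class are not reproduced):

    import pandas as pd

    df = pd.DataFrame({'flag_a': [1] * 19 + [0],   # 95% ones  -> flagged as mostly same
                       'flag_b': [0, 1] * 10})     # 50/50 mix -> passes
    thresh = 0.95
    mean = df.mean(axis=0)
    mostly_same = (mean >= thresh) | (mean <= 1 - thresh)
    print(mostly_same)  # flag_a: True, flag_b: False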
Code Example #2
    def check_uniqueness(
            data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
        """Checks if unique data contains columns with duplicates.

        Args:
            data: Data to be checked for duplicates.

        Returns:
            DataFrame with bool(s) indicating if data contains duplicates, the count of
            duplicates present, and the proportion of duplicates.

        Raises:
            ValueError: If unique data contains nulls.
        """
        UniqueFeatures._validate_unique_dtype(data)
        is_df = _utils.check_if_df(data)
        err_message = 'Columns with unique data should not contain nulls.'
        if is_df:
            if data.isna().any(axis=None):
                raise ValueError(err_message)
            count_dupes = data.nunique(axis=0).subtract(
                data.shape[0]).multiply(-1)
            is_dupes = count_dupes.astype(bool)
            prop_dupes = count_dupes.divide(data.shape[0])
        else:
            if data.isna().any():
                raise ValueError(err_message)
            count_dupes = data.shape[0] - data.nunique()
            is_dupes = bool(count_dupes)
            prop_dupes = count_dupes / data.shape[0]
        result = _utils.result_to_df(data=is_dupes,
                                     title='dupes_present',
                                     dupe_count=count_dupes,
                                     prop_dupe=prop_dupes)
        return result
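
The duplicate count above is just row count minus unique count; a minimal illustration on an invented `pd.Series` (helper functions omitted):

    import pandas as pd

    ids = pd.Series(['a1', 'a2', 'a2', 'a3'], name='user_id')
    count_dupes = ids.shape[0] - ids.nunique()      # 4 rows, 3 unique values -> 1 duplicate
    print(bool(count_dupes), count_dupes, count_dupes / ids.shape[0])  # True 1 0.25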
Code Example #3
    def check_fuzzy_nulls(
        data: Union[pd.DataFrame, pd.Series],
        add_fuzzy_nulls: Optional[List] = None,
    ) -> pd.DataFrame:
        """Checks if DataFrame contains values commonly used to denote nulls (fuzzy nulls).

        Args:
            data: Data to be checked for fuzzy nulls.
            add_fuzzy_nulls: Additional items to check as fuzzy nulls.

        Returns:
            DataFrame with bool(s) indicating if data contains any fuzzy nulls, count of
            the fuzzy nulls present, and the proportion of fuzzy nulls.
        """
        is_df = _utils.check_if_df(data)
        fuzzy_nulls = ['null', 'Null', 'NULL', '', ' ']
        if add_fuzzy_nulls is not None:
            fuzzy_nulls.extend(add_fuzzy_nulls)
        is_fuzzy_nulls = data.isin(fuzzy_nulls).any(axis=0)
        count_fuzzy_nulls = data.isin(fuzzy_nulls).sum(axis=0)
        if is_df:
            prop_fuzzy_nulls = count_fuzzy_nulls.divide(data.shape[0])
        else:
            prop_fuzzy_nulls = count_fuzzy_nulls / data.shape[0]
        result = _utils.result_to_df(
            data=is_fuzzy_nulls,
            title='fuzzy_nulls_present',
            fuzzy_null_count=count_fuzzy_nulls,
            prop_fuzzy_null=prop_fuzzy_nulls,
        )
        return result
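
For illustration, the same membership test in plain pandas, extending the default list the way `add_fuzzy_nulls` would (example values invented):

    import pandas as pd

    s = pd.Series(['alice', 'NULL', '', 'bob', 'n/a'])
    fuzzy_nulls = ['null', 'Null', 'NULL', '', ' '] + ['n/a']   # 'n/a' added via add_fuzzy_nulls
    mask = s.isin(fuzzy_nulls)
    print(mask.any(), mask.sum(), mask.sum() / s.shape[0])      # True 3 0.6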
Code Example #4
    def check_outside_range(
            data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
        """Checks if binary data contains columns where min is less than 0 or max is greater than 1.

        Args:
            data: Binary data to be checked if any values are less than 0 or greater than 1.

        Returns:
            DataFrame with bool(s) indicating if data contains any values outside of the expected range.
        """
        is_df = _utils.check_if_df(data)
        BinaryFeatures._validate_binary_dtype(data)
        if is_df:
            result = (data.min(axis=0) < 0) | (data.max(axis=0) > 1)
        else:
            result = data.min() < 0 or data.max() > 1
        return _utils.result_to_df(data=result, title='outside_range')
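
A small sketch of the range test on invented data (plain pandas, helpers omitted):

    import pandas as pd

    df = pd.DataFrame({'ok': [0, 1, 1, 0], 'bad': [0, 1, 2, -1]})
    outside = (df.min(axis=0) < 0) | (df.max(axis=0) > 1)
    print(outside)  # ok: False, bad: True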
Code Example #5
    def check_all_same(
            data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
        """Checks if binary data contains all the same value.

        Args:
            data: Binary data to be checked if all values are the same.

        Returns:
            DataFrame with bool(s) indicating if data contains all the same value.
        """
        is_df = _utils.check_if_df(data)
        BinaryFeatures._validate_binary_dtype(data)
        if is_df:
            result = data.min(axis=0).eq(data.max(axis=0))
        else:
            result = data.min() == data.max()
        return _utils.result_to_df(result, title='all_same')
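
The constant-column test boils down to comparing each column's min and max; a toy example (data made up):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 1, 1], 'b': [0, 1, 0]})
    all_same = df.min(axis=0).eq(df.max(axis=0))   # a column is constant iff min == max
    print(all_same)  # a: True, b: False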
Code Example #6
    def check_n_categories(
        data: Union[pd.DataFrame, pd.Series],
        dropna: bool = False,
    ) -> pd.DataFrame:
        """Counts the number of categories.

        Args:
            data: Data to count categories for.
            dropna: If True, ignore nulls; if False, count nulls as a category.

        Returns:
            DataFrame with count(s) of categories.
        """
        CategoricalFeatures._validate_categorical_dtype(data)
        is_df = _utils.check_if_df(data)
        if is_df:
            result = data.nunique(axis=0, dropna=dropna)
        else:
            result = data.nunique(dropna=dropna)
        return _utils.result_to_df(result, title='n_categories')
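
The `dropna` flag only changes whether nulls count as their own category; a quick illustration on an invented Series:

    import pandas as pd

    s = pd.Series(['red', 'blue', 'red', None])
    print(s.nunique(dropna=True))   # 2 -> the null is ignored
    print(s.nunique(dropna=False))  # 3 -> the null counts as a category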
Code Example #7
    def _validate_unique_dtype(data: Union[pd.DataFrame, pd.Series]) -> None:
        """Validates that unique data contains only dtype object, int, or datetime.

        Args:
            data: Unique data to be validated.

        Returns:
            None

        Raises:
            TypeError: If `data` contains dtype other than object, int, or datetime.
        """
        is_df = _utils.check_if_df(data)
        err_message = 'Unique feature columns should be of type object, int64, or datetime64.'
        types = (np.dtype('O'), np.dtype(int), np.dtype('datetime64[ns]'))
        if is_df:
            if not data.dtypes.isin(types).all():
                raise TypeError(err_message)
        else:
            if data.dtypes not in types:
                raise TypeError(err_message)
        return
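
A rough sketch of the DataFrame branch of this dtype check (toy frame invented here; only the membership test is shown):

    import numpy as np
    import pandas as pd

    types = (np.dtype('O'), np.dtype(int), np.dtype('datetime64[ns]'))
    df = pd.DataFrame({'id': ['a', 'b'], 'score': [0.1, 0.2]})
    print(df.dtypes.isin(types))  # id: True (object), score: False (float64) -> TypeError would be raised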
Code Example #8
    def _validate_binary_dtype(data: Union[pd.DataFrame, pd.Series]) -> None:
        """Validates that binary data contains only dtype bool or int.

        Args:
            data: Binary data to be type validated.

        Returns:
            None

        Raises:
            TypeError: If `data` contains dtype other than bool or int.
        """
        is_df = _utils.check_if_df(data)
        err_message = 'Binary feature columns should be of type bool or int64.'
        types = (np.dtype(bool), np.dtype(int))
        if is_df:
            if not data.dtypes.isin(types).all():
                raise TypeError(err_message)
        else:
            if data.dtypes not in types:
                raise TypeError(err_message)
        return
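
And the Series branch of the binary dtype check, sketched with invented values:

    import numpy as np
    import pandas as pd

    types = (np.dtype(bool), np.dtype(int))
    print(pd.Series([True, False]).dtype in types)   # True  -> bool passes
    print(pd.Series([0.5, 1.0]).dtype in types)      # False -> float64 would raise TypeError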
Code Example #9
    def check_mostly_same(
        data: Union[pd.DataFrame, pd.Series],
        thresh: float = 0.95,
        dropna: bool = False,
    ) -> pd.DataFrame:
        """Checks if categorical data contains almost all the same category.

        Args:
            data: Categorical data to be checked if almost all the same category.
            thresh: Threshold for what proportion of data must be the same category to fail check.
            dropna: If True, ignore nulls; if False, count nulls as a category.

        Returns:
            DataFrame with bool(s) indicating if data contains almost all the same category, the
            value of threshold used to determine if mostly same, the most common category, the
            count of the most common category, and the proportion of the most common category.

        Raises:
            ValueError: If `thresh` is less than or equal to 0.0 or greater than or equal to 1.0.
        """
        _utils.validate_thresh(thresh)
        CategoricalFeatures._validate_categorical_dtype(data)
        is_df = _utils.check_if_df(data)
        if is_df:
            most_common = data.mode(axis=0, dropna=dropna).loc[0, :]
            count_common = data.eq(most_common).sum(axis=0)
            prop_common = count_common.divide(data.shape[0])
            mostly_same = prop_common.ge(thresh)
        else:
            most_common = data.mode(dropna=dropna)[0]
            count_common = data.eq(most_common).sum()
            prop_common = count_common / data.shape[0]
            mostly_same = prop_common >= thresh
        result = _utils.result_to_df(
            mostly_same,
            title='mostly_same',
            thresh=thresh,
            most_common=most_common,
            count=count_common,
            prop=prop_common,
        )
        return result
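
A minimal standalone sketch of the Series branch, with a lowered threshold of 0.75 purely for illustration (data invented; `_utils` helpers omitted):

    import pandas as pd

    s = pd.Series(['jp', 'jp', 'jp', 'jp', 'us'])
    most_common = s.mode(dropna=False)[0]                 # 'jp'
    prop_common = s.eq(most_common).sum() / s.shape[0]    # 0.8
    print(most_common, prop_common, prop_common >= 0.75)  # jp 0.8 True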
Code Example #10
    def check_nulls(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
        """Checks if data contains nulls.

        Args:
            data: Data to be checked for nulls.

        Returns:
            DataFrame with bool(s) indicating if data contains any nulls, count of the nulls
            present, and the proportion of nulls.
        """
        is_df = _utils.check_if_df(data)
        is_nulls = data.isna().any(axis=0)
        count_nulls = data.isna().sum(axis=0)
        if is_df:
            prop_nulls = count_nulls.divide(data.shape[0])
        else:
            prop_nulls = count_nulls / data.shape[0]
        result = _utils.result_to_df(data=is_nulls,
                                     title='nulls_present',
                                     null_count=count_nulls,
                                     prop_null=prop_nulls)
        return result
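
The null check is plain `isna()` bookkeeping; a toy example (data made up):

    import pandas as pd

    df = pd.DataFrame({'age': [31, None, 45, None], 'name': ['ann', 'bo', 'cy', 'di']})
    print(df.isna().any(axis=0))                  # age: True,  name: False
    print(df.isna().sum(axis=0) / df.shape[0])    # age: 0.5,   name: 0.0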
Code Example #11
    def _validate_categorical_dtype(
            data: Union[pd.DataFrame, pd.Series]) -> None:
        """Validates that categorical data contains only dtype int or object (str).

        Args:
            data: Categorical data to be type validated.

        Returns:
            None

        Raises:
            TypeError: If `data` contains dtype other than int or object (str).
        """
        is_df = _utils.check_if_df(data)
        err_message = 'Categorical feature columns should be of type object or int64.'
        types = (np.dtype('O'), np.dtype(int))
        if is_df:
            if not data.dtypes.isin(types).all():
                raise TypeError(err_message)
        else:
            if data.dtypes not in types:
                raise TypeError(err_message)
        return
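
One consequence worth noting: because only object and int dtypes are accepted, a bool Series is rejected as a categorical feature. A tiny sketch (invented data):

    import numpy as np
    import pandas as pd

    types = (np.dtype('O'), np.dtype(int))
    print(pd.Series([True, False, True]).dtype in types)  # False -> TypeError would be raised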