Example #1
def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9, correlation_overrides=None, check_recoded=False, pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram.
        The default is 10.
    check_correlation : boolean
        Whether or not to check correlations.
        The default is `True`.
    correlation_threshold : float
        Threshold above which a variable pair is considered correlated.
        The default is 0.9.
    correlation_overrides : list
        Variable names that should not be rejected even when they are correlated.
        The default is `None` (no overrides).
    check_recoded : boolean
        Whether or not to check for recoded correlation (memory-heavy feature).
        Since it is an expensive computation, it should only be activated for small datasets.
        `check_correlation` must be `True` for this check to run.
        The default is `False`.
    pool_size : int
        Number of workers in the process pool.
        The default is the number of CPUs.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Notes
    -----
        * The section dedicated to checking correlations should be externalized
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})
    # Describe all variables in a univariate way
    if pool_size == 1:
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        pool.close()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variable
    if check_correlation is True:
        ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        If x~y and y~z but not x~z, it would be better to delete only y
        Better way would be to find out which variable causes the highest increase in multicollinearity.
        '''
        corr = dfcorrPear.copy()
        for x, corr_x in corr.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr in corr_x.iteritems():
                if x == y: break

                if corr > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr], index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems() if base.get_vartype(data) == 'CAT']
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue

                confusion_matrix = pd.crosstab(data1, data2)
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}

    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    supported_columns = variable_stats.transpose()[variable_stats.transpose().type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = sum(df.duplicated(subset=supported_columns)) if len(supported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
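
A minimal usage sketch for the variant above, assuming `describe` and its helpers (`multiprocess_func`, `base`, `formatters`) are importable and in scope; the DataFrame and its column names are made up:

# Hedged usage sketch; assumes the describe() defined above is in scope.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': np.random.normal(size=100),         # numeric -> NUM
    'b': np.random.choice(['x', 'y'], 100),  # categorical -> CAT
})
report = describe(df, bins=20, check_correlation=True, correlation_threshold=0.95)
print(report['table']['n'], report['table']['nvar'])  # 100 2
print(report['variables'].loc['a', 'type'])           # NUM
print(report['correlations']['pearson'])              # Pearson correlation matrix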
Example #2
def describe(df, **kwargs):
    """
    Generates an object containing summary statistics for a given DataFrame
    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    bins = kwargs.get('bins', 10)

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def describe_numeric_1d(series, base_stats):
        stats = {'mean': series.mean(), 'std': series.std(), 'variance': series.var(), 'min': series.min(),
                'max': series.max()}
        stats['range'] = stats['max'] - stats['min']

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(x)] = series.quantile(x)
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN
        stats['type'] = "NUM"
        stats['n_zeros'] = (len(series) - np.count_nonzero(series))
        stats['p_zeros'] = stats['n_zeros'] / len(series)

        # Large histogram
        imgdata = BytesIO()
        plot = series.plot(kind='hist', figsize=(6, 4),
                           facecolor='#337ab7', bins=bins)  # TODO when running on server, send this off to a different thread
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def mini_histogram(series):
        # Small histogram
        imgdata = BytesIO()
        plot = series.plot(kind='hist', figsize=(2, 0.75), facecolor='#337ab7', bins=bins)
        plot.axes.get_yaxis().set_visible(False)
        plot.set_axis_bgcolor("w")
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15, right=0.85, top=1, bottom=0.35, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_date_1d(series, base_stats):
        stats = {'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"

        # TODO: Matplotlib can't do histograms of dates.
        # stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq', 'type']
            result += [top, freq, 'CAT']

        return pd.Series(result, index=names, name=data.name)

    def describe_constant_1d(data):
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            mode = data[0]

        results_data = {'count': count, 'distinct_count': distinct_count, 'p_missing': 1 - count / leng,
                        'n_missing': leng - count,
                        'is_unique': distinct_count == leng,
                        'mode': mode,
                        'p_unique': distinct_count / count}
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except:
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    ldesc = {col: describe_1d(s) for col, s in df.iteritems()}

    # Check correlations between variables
    ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    If x~y and y~z but not x~z, it would be better to delete only y
    Better way would be to find out which variable causes the highest increase in multicollinearity.
    '''
    corr = df.corr()
    for x, corr_x in corr.iterrows():
        for y, corr in corr_x.iteritems():
            if x == y: break

            if corr > 0.9:
                ldesc[x] = pd.Series(['CORR', y, corr], index=['type', 'correlation_var', 'correlation'], name=x)

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}
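
All of the histogram helpers in these variants rely on the same trick: render a matplotlib figure into an in-memory PNG and embed it as a base64 data URI, so the HTML report needs no image files on disk. A stripped-down, self-contained sketch of just that step (the Agg backend is an assumption for headless use):

# Minimal sketch of the data-URI encoding used by the histogram helpers.
import base64
from io import BytesIO
from urllib.parse import quote

import matplotlib
matplotlib.use('Agg')  # assumption: non-interactive backend for headless rendering
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(np.random.normal(size=1000), bins=10, facecolor='#337ab7')

imgdata = BytesIO()
fig.savefig(imgdata, format='png')
imgdata.seek(0)
data_uri = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
plt.close(fig)
# data_uri can be dropped straight into an <img src="..."> tag in the report.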
Example #3
def describe(df,
             bins=10,
             correlation_overrides=None,
             pool_size=multiprocessing.cpu_count(),
             **kwargs):
    """
    Generates an object containing summary statistics for a given DataFrame
    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram
    :param correlation_overrides: Variable names not to be rejected because they are correlated
    :param pool_size: Number of workers in the process pool
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(
        resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def describe_numeric_1d(series, base_stats):
        stats = {
            'mean': series.mean(),
            'std': series.std(),
            'variance': series.var(),
            'min': series.min(),
            'max': series.max()
        }
        stats['range'] = stats['max'] - stats['min']

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(x)] = series.dropna().quantile(
                x
            )  # The dropna() is a workaround for https://github.com/pydata/pandas/issues/13098
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN
        stats['type'] = "NUM"
        stats['n_zeros'] = (len(series) - np.count_nonzero(series))
        stats['p_zeros'] = stats['n_zeros'] / len(series)
        # Histograms
        stats['histogram'] = histogram(series)
        stats['mini_histogram'] = mini_histogram(series)
        return pd.Series(stats, name=series.name)

    def _plot_histogram(series,
                        figsize=(6, 4),
                        facecolor='#337ab7',
                        bins=bins):
        """Plot an histogram from the data and return the AxesSubplot object.

        Parameters
        ----------
        series: Series, default None
            The data to plot
        figsize: a tuple (width, height) in inches, default (6,4)
            The size of the figure.
        facecolor: str
            The color code.
        bins: int
            The number of equal-width bins in the given range.
            Defaults to the enclosing `bins` argument.

        Returns
        -------
        matplotlib.AxesSubplot, The plot.
        """
        if com.is_datetime64_dtype(series):
            # TODO: These calls should be merged
            fig = plt.figure(figsize=figsize)
            plot = fig.add_subplot(111)
            plot.set_ylabel('Frequency')
            try:
                plot.hist(series.values, facecolor=facecolor, bins=bins)
            except TypeError:  # matplotlib 1.4 can't plot dates so will show empty plot instead
                pass
        else:
            plot = series.plot(
                kind='hist', figsize=figsize, facecolor=facecolor, bins=bins
            )  # TODO when running on server, send this off to a different thread
        return plot

    def histogram(series):
        """Plot an histogram of the data.

        Parameters
        ----------
        series: Series, default None
            The data to plot.

        Returns
        -------
        str, The resulting image encoded as a string.
        """
        imgdata = BytesIO()
        plot = _plot_histogram(series)
        plot.figure.subplots_adjust(left=0.15,
                                    right=0.95,
                                    top=0.9,
                                    bottom=0.1,
                                    wspace=0,
                                    hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(
            base64.b64encode(imgdata.getvalue()))
        # TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)
        return result_string

    def mini_histogram(series):
        """Plot a small (mini) histogram of the data.

        Parameters
        ----------
        series: Series, default None
            The data to plot.

        Returns
        -------
        str, The resulting image encoded as a string.
        """
        imgdata = BytesIO()
        plot = _plot_histogram(series, figsize=(2, 0.75))
        plot.axes.get_yaxis().set_visible(False)
        plot.set_axis_bgcolor("w")
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15,
                                    right=0.85,
                                    top=1,
                                    bottom=0.35,
                                    wspace=0,
                                    hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(
            base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_date_1d(series, base_stats):
        stats = {'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"
        stats['histogram'] = histogram(series)
        stats['mini_histogram'] = mini_histogram(series)
        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq']
            result += [top, freq]

        return pd.Series(result, index=names, name=data.name)

    def describe_constant_1d(data):
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        leng = len(data)  # number of observations in the Series
        count = data.count()  # number of non-NaN observations in the Series

        # Replace infinite values with NaNs to avoid issues with
        # histograms later.
        data.replace(to_replace=[np.inf, np.NINF, np.PINF],
                     value=np.nan,
                     inplace=True)

        n_infinite = count - data.count()  # number of infinite observations in the Series

        distinct_count = data.nunique(
            dropna=False)  # number of unique elements in the Series
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            mode = data[0]

        results_data = {
            'count': count,
            'distinct_count': distinct_count,
            'p_missing': 1 - count / leng,
            'n_missing': leng - count,
            'p_infinite': n_infinite / leng,
            'n_infinite': n_infinite,
            'is_unique': distinct_count == leng,
            'mode': mode,
            'p_unique': distinct_count / count
        }
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except:
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
            result['type'] = 'CAT'

        return result

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    pool = multiprocessing.Pool(pool_size)
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    ldesc = {
        col: s
        for col, s in pool.map(local_multiprocess_func, df.iteritems())
    }
    pool.close()

    # Check correlations between variables
    ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    If x~y and y~z but not x~z, it would be better to delete only y
    Better way would be to find out which variable causes the highest increase in multicollinearity.
    '''
    corr = df.corr()
    for x, corr_x in corr.iterrows():
        if correlation_overrides and x in correlation_overrides:
            continue

        for y, corr in corr_x.iteritems():
            if x == y: break

            if corr > 0.9:
                ldesc[x] = pd.Series(
                    ['CORR', y, corr],
                    index=['type', 'correlation_var', 'correlation'],
                    name=x)

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (
        table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize /
                                                        table_stats['n'])

    table_stats.update(
        {k: 0
         for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: df[k].value_counts()
                 for k in df.columns}
    }
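
The describe_1d above counts infinite values by taking the non-NaN count before and after replacing ±inf with NaN (infinities count as non-missing, NaNs do not). A short check of that bookkeeping on a toy Series:

# Sketch of the infinite-value bookkeeping from describe_1d.
import numpy as np
import pandas as pd

data = pd.Series([1.0, np.inf, -np.inf, np.nan, 5.0])
count_before = data.count()                     # 4: inf values are non-NaN
data = data.replace([np.inf, -np.inf], np.nan)  # avoids breaking histograms later
n_infinite = count_before - data.count()        # 4 - 2 = 2
print(n_infinite)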
Example #4
def describe(df, **kwargs):
    """
    Generates an object containing summary statistics for a given DataFrame
    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    bins = kwargs.get('bins', 10)

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(
        resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def describe_numeric_1d(series, base_stats):
        stats = {
            'mean': series.mean(),
            'std': series.std(),
            'variance': series.var(),
            'min': series.min(),
            'max': series.max()
        }
        stats['range'] = stats['max'] - stats['min']

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(x)] = series.quantile(x)
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN
        stats['type'] = "NUM"
        stats['n_zeros'] = (len(series) - np.count_nonzero(series))
        stats['p_zeros'] = stats['n_zeros'] / len(series)

        # Large histogram
        imgdata = BytesIO()
        plot = series.plot(
            kind='hist', figsize=(6, 4), facecolor='#337ab7', bins=bins
        )  # TODO when running on server, send this off to a different thread
        plot.figure.subplots_adjust(left=0.15,
                                    right=0.95,
                                    top=0.9,
                                    bottom=0.1,
                                    wspace=0,
                                    hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(
            base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def mini_histogram(series):
        # Small histogram
        imgdata = BytesIO()
        plot = series.plot(kind='hist',
                           figsize=(2, 0.75),
                           facecolor='#337ab7',
                           bins=bins)
        plot.axes.get_yaxis().set_visible(False)
        plot.set_axis_bgcolor("w")
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15,
                                    right=0.85,
                                    top=1,
                                    bottom=0.35,
                                    wspace=0,
                                    hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(
            base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_date_1d(series, base_stats):
        stats = {'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"

        # TODO: Matplotlib can't do histograms of dates.
        # stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq', 'type']
            result += [top, freq, 'CAT']

        return pd.Series(result, index=names, name=data.name)

    def describe_constant_1d(data):
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            mode = data[0]

        results_data = {
            'count': count,
            'distinct_count': distinct_count,
            'p_missing': 1 - count / leng,
            'n_missing': leng - count,
            'is_unique': distinct_count == leng,
            'mode': mode,
            'p_unique': distinct_count / count
        }
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except:
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    ldesc = {col: describe_1d(s) for col, s in df.iteritems()}

    # Check correlations between variables
    ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    If x~y and y~z but not x~z, it would be better to delete only y
    Better way would be to find out which variable causes the highest increase in multicollinearity.
    '''
    corr = df.corr()
    for x, corr_x in corr.iterrows():
        for y, corr in corr_x.iteritems():
            if x == y: break

            if corr > 0.9:
                ldesc[x] = pd.Series(
                    ['CORR', y, corr],
                    index=['type', 'correlation_var', 'correlation'],
                    name=x)

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (
        table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize /
                                                        table_stats['n'])

    table_stats.update(
        {k: 0
         for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: df[k].value_counts()
                 for k in df.columns}
    }
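
For datetime64 input, describe_date_1d reports min, max, and their difference, which pandas returns as a Timedelta. For instance:

# The DATE branch's range is a Timedelta for datetime64 data.
import pandas as pd

series = pd.Series(pd.to_datetime(['2016-01-01', '2016-03-01']))
stats = {'min': series.min(), 'max': series.max()}
stats['range'] = stats['max'] - stats['min']
print(stats['range'])  # 60 days 00:00:00 (2016 is a leap year)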
Example #5
def describe(df,
             bins=10,
             correlation_overrides=None,
             pool_size=multiprocessing.cpu_count(),
             **kwargs):
    """
    Generates an object containing summary statistics for a given DataFrame
    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram
    :param correlation_overrides: Variable names not to be rejected because they are correlated
    :param pool_size: Number of workers in the process pool
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(
        resource_filename(__name__, "pandas_profiling.mplstyle"))

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    pool = multiprocessing.Pool(pool_size)
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    ldesc = {
        col: s
        for col, s in pool.map(local_multiprocess_func, df.iteritems())
    }
    pool.close()

    # Check correlations between variables
    ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    If x~y and y~z but not x~z, it would be better to delete only y
    Better way would be to find out which variable causes the highest increase in multicollinearity.
    '''
    corr = df.corr()
    for x, corr_x in corr.iterrows():
        if correlation_overrides and x in correlation_overrides:
            continue

        for y, corr in corr_x.iteritems():
            if x == y: break

            if corr > 0.9:
                ldesc[x] = pd.Series(
                    ['CORR', y, corr],
                    index=['type', 'correlation_var', 'correlation'])

    categorical_variables = [(name, data) for (name, data) in df.iteritems()
                             if get_vartype(data) == 'CAT']
    for (name1,
         data1), (name2,
                  data2) in itertools.combinations(categorical_variables, 2):
        if correlation_overrides and name1 in correlation_overrides:
            continue

        confusion_matrix = pd.crosstab(data1, data2)
        if confusion_matrix.values.diagonal().sum() == len(df):
            ldesc[name1] = pd.Series(['RECODED', name2],
                                     index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (
        table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize /
                                                        table_stats['n'])

    table_stats.update({
        k: 0
        for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED")
    })
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats[
        'CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: df[k].value_counts()
                 for k in df.columns}
    }
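
The RECODED check above treats one categorical variable as a pure relabeling of another when every observation lands on the diagonal of their confusion matrix. A toy illustration (the series names are made up):

# Toy illustration of the RECODED test: data2 is data1 with renamed labels.
import pandas as pd

data1 = pd.Series(['a', 'b', 'a', 'b'], name='color')
data2 = pd.Series(['A', 'B', 'A', 'B'], name='color_recoded')

confusion_matrix = pd.crosstab(data1, data2)
print(confusion_matrix.values.diagonal().sum() == len(data1))  # True -> RECODED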
Example #6
def describe(df):
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    # reset matplotlib style before use
    matplotlib.style.use("default")
    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def describe_numeric_1d(series, base_stats):
        stats = {'mean': series.mean(), 'std': series.std(), 'variance': series.var(), 'min': series.min(),
                'max': series.max()}
        stats['range'] = stats['max'] - stats['min']

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(x)] = series.quantile(x)
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN
        stats['type'] = "NUM"
        stats['p_zeros'] = (len(series) - np.count_nonzero(series)) / len(series)

        # Large histogram
        imgdata = BytesIO()
        plot = series.plot(kind='hist', figsize=(6, 4),
                           facecolor='#337ab7')  # TODO when running on server, send this off to a different thread
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def mini_histogram(series):
        # Small histogram
        imgdata = BytesIO()
        plot = series.plot(kind='hist', figsize=(2, 0.75), facecolor='#337ab7')
        plot.axes.get_yaxis().set_visible(False)
        plot.set_axis_bgcolor("w")
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15, right=0.85, top=1, bottom=0.35, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_date_1d(series, base_stats):
        stats = {'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"

        # TODO: Matplotlib can't do histograms of dates.
        # stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq', 'type']
            result += [top, freq, 'CAT']

        return pd.Series(result, index=names, name=data.name)

    def describe_constant_1d(data):
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        # Is unique
        # Percent missing
        names = ['count', 'distinct_count', 'p_missing', 'n_missing', 'is_unique', 'mode', 'p_unique', 'memorysize']
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            mode = data[0]

        results_data = [count, distinct_count, 1 - count / leng, leng - count, distinct_count == leng, mode,
                        distinct_count / count, data.memory_usage()]
        result = pd.Series(results_data, index=names, name=data.name)

        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    ldesc = [describe_1d(s) for _, s in df.iteritems()]
    # set a convenient order for rows
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}
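
Every variant shares the pretty_name helper, which turns a quantile fraction into the percentage label used as a stats key (so stats['25%'] and stats['75%'] exist for the IQR computation). Its behavior on the quantiles actually used:

# Behavior of the shared pretty_name helper.
def pretty_name(x):
    x *= 100
    if x == int(x):
        return '%.0f%%' % x
    else:
        return '%.1f%%' % x

print([pretty_name(q) for q in (0.05, 0.25, 0.5, 0.75, 0.95)])
# ['5%', '25%', '50%', '75%', '95%']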
Example #7
def describe(df, bins=10, check_correlation=True, correlation_overrides=None, pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram
    check_correlation : boolean
        Whether or not to check correlation.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated
    pool_size : int
        Number of workers in the process pool

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Notes
    -----
        * The section dedicated to checking correlations should be externalized
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    pool = multiprocessing.Pool(pool_size)
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
    pool.close()

    # Check correlations between variable
    if check_correlation is True:
        ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        If x~y and y~z but not x~z, it would be better to delete only y
        Better way would be to find out which variable causes the highest increase in multicollinearity.
        '''
        corr = df.corr()
        for x, corr_x in corr.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr in corr_x.iteritems():
                if x == y: break

                if corr > 0.9:
                    ldesc[x] = pd.Series(['CORR', y, corr], index=['type', 'correlation_var', 'correlation'])

        categorical_variables = [(name, data) for (name, data) in df.iteritems() if base.get_vartype(data) == 'CAT']
        for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
            if correlation_overrides and name1 in correlation_overrides:
                continue

            confusion_matrix = pd.crosstab(data1, data2)
            if confusion_matrix.values.diagonal().sum() == len(df):
                ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}
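
A toy demonstration of the correlation-rejection pass used above: with one column a noisy copy of another, the later column gets flagged against the earlier one (column names are made up; 0.9 is the hard-coded threshold from the loop):

# Toy demo of the correlation-rejection pass.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x = rng.normal(size=200)
df = pd.DataFrame({'x': x, 'y': x + rng.normal(scale=0.01, size=200)})

corr = df.corr()
for a, corr_a in corr.iterrows():
    for b, c in corr_a.iteritems():
        if a == b: break
        if c > 0.9:
            print('%s correlates with %s (corr=%.3f)' % (a, b, c))
# y correlates with x (corr close to 1.000)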
Example #8
def describe(df, bins=10, correlation_overrides=None, pool_size=multiprocessing.cpu_count(), **kwargs):
    """
    Generates an object containing summary statistics for a given DataFrame
    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram
    :param correlation_overrides: Variable names not to be rejected because they are correlated
    :param pool_size: Number of workers in the process pool
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def describe_numeric_1d(series, base_stats):
        stats = {'mean': series.mean(), 'std': series.std(), 'variance': series.var(), 'min': series.min(),
                'max': series.max()}
        stats['range'] = stats['max'] - stats['min']

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(x)] = series.dropna().quantile(x) # The dropna() is a workaround for https://github.com/pydata/pandas/issues/13098
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN
        stats['type'] = "NUM"
        stats['n_zeros'] = (len(series) - np.count_nonzero(series))
        stats['p_zeros'] = stats['n_zeros'] / len(series)
        # Histograms
        stats['histogram'] = histogram(series)
        stats['mini_histogram'] = mini_histogram(series)
        return pd.Series(stats, name=series.name)

    def _plot_histogram(series, figsize=(6, 4), facecolor='#337ab7', bins=bins):
        """Plot an histogram from the data and return the AxesSubplot object.

        Parameters
        ----------
        series: Series, default None
            The data to plot
        figsize: a tuple (width, height) in inches, default (6,4)
            The size of the figure.
        facecolor: str
            The color code.
        bins: int
            The number of equal-width bins in the given range.
            Defaults to the enclosing `bins` argument.

        Returns
        -------
        matplotlib.AxesSubplot, The plot.
        """
        if com.is_datetime64_dtype(series):
            # TODO: These calls should be merged
            fig = plt.figure(figsize=figsize)
            plot = fig.add_subplot(111)
            plot.set_ylabel('Frequency')
            try:
                plot.hist(series.values, facecolor=facecolor, bins=bins)
            except TypeError: # matplotlib 1.4 can't plot dates so will show empty plot instead
                pass
        else:
            plot = series.plot(kind='hist', figsize=figsize,
                               facecolor=facecolor,
                               bins=bins)  # TODO when running on server, send this off to a different thread
        return plot

    def histogram(series):
        """Plot an histogram of the data.

        Parameters
        ----------
        series: Series, default None
            The data to plot.

        Returns
        -------
        str, The resulting image encoded as a string.
        """
        imgdata = BytesIO()
        plot = _plot_histogram(series)
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)
        return result_string

    def mini_histogram(series):
        """Plot a small (mini) histogram of the data.

        Parameters
        ----------
        series: Series, default None
            The data to plot.

        Returns
        -------
        str, The resulting image encoded as a string.
        """
        imgdata = BytesIO()
        plot = _plot_histogram(series, figsize=(2, 0.75))
        plot.axes.get_yaxis().set_visible(False)
        plot.set_axis_bgcolor("w")
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15, right=0.85, top=1, bottom=0.35, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_date_1d(series, base_stats):
        stats = {'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"
        stats['histogram'] = histogram(series)
        stats['mini_histogram'] = mini_histogram(series)
        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq']
            result += [top, freq]

        return pd.Series(result, index=names, name=data.name)

    def describe_constant_1d(data):
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        leng = len(data)  # number of observations in the Series
        count = data.count()  # number of non-NaN observations in the Series

        # Replace infinite values with NaNs to avoid issues with
        # histograms later.
        data.replace(to_replace=[np.inf, np.NINF, np.PINF], value=np.nan, inplace=True)

        n_infinite = count - data.count()  # number of infinite observations in the Series
        
        distinct_count = data.nunique(dropna=False)  # number of unique elements in the Series
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            mode = data[0]

        results_data = {'count': count,
                        'distinct_count': distinct_count,
                        'p_missing': 1 - count / leng,
                        'n_missing': leng - count,
                        'p_infinite': n_infinite / leng,
                        'n_infinite': n_infinite,
                        'is_unique': distinct_count == leng,
                        'mode': mode,
                        'p_unique': distinct_count / count}
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except:
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
            result['type'] = 'CAT'

        return result

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    pool = multiprocessing.Pool(pool_size)
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
    pool.close()

    # Check correlations between variables
    ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    If x~y and y~z but not x~z, it would be better to delete only y
    Better way would be to find out which variable causes the highest increase in multicollinearity.
    '''
    corr = df.corr()
    for x, corr_x in corr.iterrows():
        if correlation_overrides and x in correlation_overrides:
            continue

        for y, corr in corr_x.iteritems():
            if x == y: break

            if corr > 0.9:
                ldesc[x] = pd.Series(['CORR', y, corr], index=['type', 'correlation_var', 'correlation'], name=x)

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}
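
One subtlety in the NUM branch shared by all variants: n_zeros is computed as len(series) - np.count_nonzero(series), and np.count_nonzero treats NaN as nonzero, so missing values are not counted as zeros. A quick check:

# Quick check of the n_zeros computation from describe_numeric_1d.
import numpy as np
import pandas as pd

series = pd.Series([0.0, 1.0, np.nan, 3.0])
n_zeros = len(series) - np.count_nonzero(series)
print(n_zeros, n_zeros / len(series))  # 1 0.25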