def plot_number_var(ax, arr, color=DEFAULT_COLOR, label=None, alpha=1.):
    """Draw a histogram of a numeric array on a matplotlib axes.

    The input is validated first, then drawn with ``ax.hist`` using
    50 bins.

    Parameters
    ----------
    ax: plt.axes.Axes
        Axes that receives the histogram
    arr: array like
        Numeric values to bin
    color: str (default DEFAULT_COLOR)
        Colour of the bars
    label: str (default None)
        Legend label attached to the histogram
    alpha: float (default 1.)
        Opacity of the bars

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not a number array
    """
    array_like = utils.is_array_like(arr)
    if not array_like:
        raise TypeError('arr is not an array like')

    detected = utils.find_dtype(arr)
    if detected != 'number':
        raise TypeError('arr is not a number array')

    hist_options = {
        'bins': 50,
        'color': color,
        'label': label,
        'alpha': alpha,
    }
    ax.hist(arr, **hist_options)
Esempio n. 2
0
def init_bias_module(df, dataset):
    """Initialise the bias module attached to a dataset.

    The module record is looked up by dataset id and flagged
    ``'loading'``. For every protected attribute of the dataset, a dtype
    and a list of reference values are collected:

    - non-object column with more than 10 distinct values: the dtype is
      kept and the values are reduced to the rounded mean of the
      distinct values;
    - non-object column with at most 10 distinct values: it is treated
      as an ``'object'`` column and all distinct values are kept;
    - object column: all distinct values are kept.

    The result is stored on the module together with the ``'loaded'``
    status. If that update does not return ``'updated'`` or raises, the
    module is flagged ``'failed'`` instead.

    Parameters
    ----------
    df: pd.DataFrame (presumably; columns are read with ``df[name]``)
        Data holding the target, score and protected attribute columns
    dataset: object
        Dataset record; its ``id``, ``target``, ``score`` and
        ``protected_attr`` attributes are read
    """
    module = select_from_db(ModuleBias, 'dataset_id', dataset.id)
    update_in_db(module, {'status': 'loading'})

    # NOTE(review): these two values are never used afterwards; the only
    # visible effect of the lookups is to fail early when a column is
    # missing -- confirm whether that is intentional.
    y_true = df[dataset.target]
    y_pred = df[dataset.score]

    privileged_group = {}
    for attr in dataset.protected_attr:
        column = df[attr]
        dtype = utils.find_dtype(column, len_sample=1000)
        uniq_values = list(column.value_counts().index)

        if dtype != 'object':
            if len(uniq_values) > 10:
                # Too many distinct values to list: keep a single
                # reference value, the mean of the distinct values.
                uniq_values = [round(np.mean(uniq_values), 2)]
            else:
                # Few distinct values: handle the column as categorical.
                dtype = 'object'

        privileged_group[attr] = {'dtype': dtype, 'values': uniq_values}

    data = {'status': 'loaded', 'privileged_group': privileged_group}

    try:
        res = update_in_db(module, data)

        if res != 'updated':
            update_in_db(module, {'status': 'failed'})

    # Best-effort status update on any regular error; unlike a bare
    # ``except`` this lets KeyboardInterrupt / SystemExit propagate.
    except Exception:
        update_in_db(module, {'status': 'failed'})
def plot_datetime_var(ax, arr, color=DEFAULT_COLOR, label=None, alpha=1.):
    """Plots a filled line plot of datetime counts into a matplotlib axes.

    Values are grouped at a granularity chosen from the span (in days)
    between the earliest and the latest date:

    - more than 1500 days: by year
    - more than 100 days: by month
    - more than 5 days: by day
    - otherwise: by day and time of day

    The number of values per group is then drawn as a line with the area
    under it filled.

    Parameters
    ----------
    ax: plt.axes.Axes
        axes where to add the plot
    arr: array like
        Array of datetime values
    color: str (default DEFAULT_COLOR)
        color of the plot
    label: str (default None)
        label of the plot
    alpha: float (default 1.)
        opacity of the filled area

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not a datetime array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'datetime':
        raise TypeError('arr is not a datetime array')

    # pd.to_datetime only returns a Series (the type carrying the ``.dt``
    # accessor used below) when it is given a Series; lists and arrays
    # come back as a DatetimeIndex. Normalise the input first.
    if isinstance(arr, pd.DataFrame) and arr.shape[1] == 1:
        arr = arr.iloc[:, 0]
    elif not isinstance(arr, (pd.Series, pd.DataFrame)):
        arr = pd.Series(arr)

    arr = pd.to_datetime(arr, errors='coerce')

    date_min = arr.min()
    date_max = arr.max()
    gap = (date_max - date_min).days

    # Pick the grouping granularity from the covered time span.
    if gap > 1500:
        arr = arr.dt.year.astype(str)
    elif gap > 100:
        arr = arr.dt.strftime('%Y-%m')
    elif gap > 5:
        arr = arr.dt.strftime('%Y-%m-%d')
    else:
        # NOTE(review): '%r' is presumably the locale's 12-hour clock
        # time; sorting such labels as text may not be chronological --
        # confirm this format is the intended one.
        arr = arr.dt.strftime('%Y-%m-%d-%r')

    # Count values per group, ordered by group label.
    v_c = arr.value_counts().sort_index()
    dates = mdates.num2date(mdates.datestr2num(v_c.index))
    y = v_c.values

    ax.plot(dates, y, color=color, label=label)
    ax.fill_between(dates, 0, y, color=color, alpha=alpha)
Esempio n. 4
0
def describe_datetime(arr, format='%Y-%m-%d'):
    """Compute descriptive statistics for an array of datetimes.

    The statistics returned by ``describe_common`` are extended with:

    - ``'unique values'``: number of distinct values
    - ``'most common'``: first mode, as text
    - ``'min'``, ``'mean'``, ``'max'``: as text

    Parameters
    ----------
    arr: array like
        Values to summarise
    format: str
        ``strftime`` pattern used to render the datetime statistics

    Returns
    -------
    Same object as returned by ``describe_common``, with the entries
    listed above added.

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not a datetime array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'datetime':
        raise TypeError('arr is not a datetime array')

    def _as_text(stamp):
        # Render one timestamp with the requested pattern.
        return stamp.strftime(format)

    # Plain lists and numpy arrays are wrapped so pandas methods apply.
    if type(arr) in (list, np.ndarray):
        arr = pd.Series(arr)

    # Unparseable entries become NaT instead of raising.
    stamps = pd.to_datetime(arr, errors='coerce')

    desc = describe_common(stamps)

    desc['unique values'] = stamps.nunique()
    desc['most common'] = _as_text(stamps.mode()[0])
    desc['min'] = _as_text(stamps.min())
    desc['mean'] = _as_text(stamps.mean())
    desc['max'] = _as_text(stamps.max())

    return desc
Esempio n. 5
0
 def test_find_dtype(self):
     """Check the dtype label utils.find_dtype returns for several inputs."""
     # Calling without any argument must be rejected.
     self.assertRaises(TypeError, utils.find_dtype)

     def two_dates():
         # Fresh list on every call so no case shares its input.
         return [
             datetime.datetime(1958, 5, 12),
             datetime.datetime(1980, 12, 12)
         ]

     # (input, expected label), checked in this order.
     cases = [
         ([1, 2], 'number'),
         (['1', '2'], 'object'),
         (two_dates(), 'datetime'),
         (['blabla', '2'], 'object'),
         (pd.DataFrame([1, 2]), 'number'),
         (pd.Series(['1', '2']), 'object'),
         (pd.Series(two_dates()), 'datetime'),
         (pd.DataFrame(['blabla', '2']), 'object'),
     ]
     for values, expected in cases:
         self.assertEqual(utils.find_dtype(values), expected)
Esempio n. 6
0
def describe_number(arr):
    """Descriptive statistics about a number array.

    Returned statistics:

    - Count of valid values
    - Count of missing values
    - Mean (rounded to 4 decimals)
    - Mode
    - Min
    - Quantile 25%
    - Median
    - Quantile 75%
    - Max

    Parameters
    ----------
    arr: array like
        Array of values to get descriptive statistics from

    Returns
    -------
    Same object as returned by ``describe_common``, with the entries
    ``'mean'``, ``'mode'``, ``'min'``, ``'quantile 25%'``,
    ``'quantile 50%'``, ``'quantile 75%'`` and ``'max'`` added.

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not a number array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'number':
        raise TypeError('arr is not a number array')

    desc = describe_common(arr)

    desc['mean'] = np.round(np.mean(arr), 4)
    # ``stats`` is presumably scipy.stats. Depending on the SciPy
    # version, the mode comes back as a length-1 array or as a bare
    # scalar, so indexing it with ``[0][0]`` fails on recent releases.
    # Computing over the flattened input and ravel-ing the result yields
    # a scalar in both cases.
    desc['mode'] = np.ravel(stats.mode(arr, axis=None)[0])[0]
    desc['min'] = np.min(arr)
    desc['quantile 25%'] = np.quantile(arr, 0.25)
    desc['quantile 50%'] = np.median(arr)
    desc['quantile 75%'] = np.quantile(arr, 0.75)
    desc['max'] = np.max(arr)

    return desc
def plot_object_var(ax,
                    arr,
                    top=10,
                    color=DEFAULT_COLOR,
                    label=None,
                    alpha=1.):
    """Draw a bar plot of the most frequent values on a matplotlib axes.

    Occurrences of each value are counted, ordered from most to least
    frequent, and at most ``top`` of them are drawn as bars.

    Parameters
    ----------
    ax: plt.axes.Axes
        Axes that receives the bars
    arr: array like
        Object values to count
    top: int (default 10)
        Maximum number of distinct values to draw
    color: str (default DEFAULT_COLOR)
        Colour of the bars
    label: str (default None)
        Legend label attached to the bars
    alpha: float (default 1.)
        Opacity of the bars

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not an object array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'object':
        raise TypeError('arr is not an object array')

    # Plain lists and numpy arrays are wrapped to get value_counts().
    if type(arr) in (list, np.ndarray):
        arr = pd.Series(arr)

    counts = arr.value_counts().sort_values(ascending=False)

    # Keep only the leading entries when there are more than `top`.
    if len(counts) > top:
        counts = counts[:top]

    categories = counts.index
    heights = counts.values

    ax.bar(categories, heights, color=color, label=label, alpha=alpha)
Esempio n. 8
0
def describe_object(arr):
    """Compute descriptive statistics for an array of objects.

    The statistics returned by ``describe_common`` are extended with:

    - ``'unique values'``: number of distinct values
    - ``'most common'``: first mode of the values

    Parameters
    ----------
    arr: array like
        Values to summarise

    Returns
    -------
    Same object as returned by ``describe_common``, with the entries
    listed above added.

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not an object array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'object':
        raise TypeError('arr is not an object array')

    # Plain lists and numpy arrays are wrapped so pandas methods apply.
    needs_wrapping = type(arr) in (list, np.ndarray)
    values = pd.Series(arr) if needs_wrapping else arr

    desc = describe_common(values)

    desc['unique values'] = values.nunique()
    desc['most common'] = values.mode()[0]

    return desc
def _display_name(values):
    """Return the name of a Series (or first DataFrame column) as text.

    An empty string is returned when the object carries no name.
    """
    if isinstance(values, pd.Series):
        raw = values.name
    elif isinstance(values, pd.DataFrame):
        raw = values.columns[0]
    else:
        raw = None
    return '' if raw is None else str(raw)


def plot_variable(arr,
                  legend=None,
                  colors=None,
                  xlog=False,
                  ylog=False,
                  **kwargs):
    """Plots a graph with two parts given an array.
    First part is a custom plot depending on the array dtype.
    Second part is the descriptive statistics table.

    First plot is:

    - Histogram if dtype is number (using plot_number_var)
    - Line plot if dtype is datetime (using plot_datetime_var)
    - Bar plot if dtype is object (using plot_object_var)

    If a legend array is set, one plot is drawn per distinct legend
    value, in order of first appearance. Values of ``arr`` are matched
    to legend entries by position.

    Parameters
    ----------
    arr: array like
        Array of values to plot
    legend: array like (default None)
        Array of legend values (same length as arr)
    colors: list (default None)
        Colors cycled over the legend values; a built-in palette is
        used when legend is set and colors is None
    xlog: bool (default False)
        Scale xaxis in log scale
    ylog: bool (default False)
        Scale yaxis in log scale
    **kwargs
        Forwarded to ``plots.plot_or_figure``

    Returns
    -------
    Whatever ``plots.plot_or_figure(fig, **kwargs)`` returns.

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        legend is not an array like
    ValueError:
        arr and legend have not the same length
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    # `and` (not `&`) so the legend is only inspected when it is given.
    if legend is not None and not utils.is_array_like(legend):
        raise TypeError('legend is not an array like')
    if legend is not None and len(arr) != len(legend):
        raise ValueError('arr and legend have not the same length')

    name = _display_name(arr)
    if not isinstance(arr, (pd.Series, pd.DataFrame)):
        arr = pd.Series(arr)

    legend_name = ''
    legend_values = None
    if legend is not None:
        legend_name = _display_name(legend)

        if colors is None:
            colors = [
                '#3498db', '#e67e22', '#2ecc71', '#f1c40f', '#9b59b6',
                '#e74c3c'
            ]

        # Normalise the legend to a positionally indexed Series so that
        # lists, arrays, Series and single-column DataFrames all give an
        # element-wise comparison below.
        if isinstance(legend, pd.DataFrame):
            legend = legend.iloc[:, 0]
        legend_values = pd.Series(legend).reset_index(drop=True)

    dtype = utils.find_dtype(arr)
    desc = describe(arr)

    # Init figure
    fig = plt.figure(figsize=(15, 5), constrained_layout=False)
    gs = fig.add_gridspec(1, 12)

    # 2 axes : one for the plot, one for the stats
    ax1 = fig.add_subplot(gs[0, :8])
    ax2 = fig.add_subplot(gs[0, 8:])

    # format title; name parts are only appended when they are non-empty
    title = 'Histogram' if dtype == 'number' else 'Plot'
    if name:
        title = title + ' of ' + name
    if legend_name:
        title = title + ' by ' + legend_name

    ax1.set_title(title, loc='center', fontsize=22)

    # Use plot depending on the dtype
    # Number : histogram
    if dtype == 'number':
        plot_fun = plot_number_var

    # Datetime : line plot
    elif dtype == 'datetime':
        plot_fun = plot_datetime_var
        fig.autofmt_xdate()

    # Object : bar plot
    else:
        plot_fun = plot_object_var
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=18)

    if legend_values is None:
        plot_fun(ax1, arr)
    else:
        # unique() keeps first-appearance order, so the color given to
        # each legend value is stable from one run to the next.
        for i, label in enumerate(legend_values.unique()):
            mask = (legend_values == label).to_numpy()
            arr_val = arr[mask]

            if len(arr_val) == 0:
                continue

            plot_fun(ax1,
                     arr_val,
                     color=colors[i % len(colors)],
                     label=label,
                     alpha=0.5)

    # If log is needed
    if xlog:
        ax1.set_xscale('log')
    if ylog:
        ax1.set_yscale('log')

    # put legend if it's necessary
    if legend is not None:
        ax1.legend(loc=0, frameon=True)

    # Add describe stats table
    desc_formated = utils.format_describe_str(desc)
    plot_table_describe(ax2, desc_formated)

    return plots.plot_or_figure(fig, **kwargs)
Esempio n. 10
0
def describe(arr):
    """Descriptive statistics about an array.
    Depending on the detected dtype (number, datetime, object)
    it returns specific stats.

    Common statistics for all dtypes (using describe_common):

    - Count of valid values
    - Count of missing values

    Number statistics (using describe_number):

    - Mean
    - Mode
    - Min
    - Quantile 25%
    - Median
    - Quantile 75%
    - Max

    Datetime statistics (using describe_datetime):

    - Count of unique values
    - Most common value
    - Min
    - Mean
    - Max

    Object statistics (using describe_object):

    - Count of unique values
    - Most common value

    Parameters
    ----------
    arr: array like
        Array of values to get descriptive statistics from

    Returns
    -------
    dict
        Dictionary with descriptive statistics, or None when the
        detected dtype is none of number, object or datetime

    Raises
    ------
    TypeError:
        arr is not an array like
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')

    if isinstance(arr, list):
        arr = np.array(arr)
    elif isinstance(arr, (pd.Series, pd.DataFrame)):
        arr = arr.to_numpy()

    # A single-column table converts to an (n, 1) array; flatten it so
    # the dtype-specific helpers, which wrap arrays in a pd.Series,
    # receive one-dimensional data.
    if isinstance(arr, np.ndarray) and arr.ndim == 2 and arr.shape[1] == 1:
        arr = arr.ravel()

    dtype = utils.find_dtype(arr)

    if dtype == 'number':
        return describe_number(arr)
    elif dtype == 'object':
        return describe_object(arr)
    elif dtype == 'datetime':
        return describe_datetime(arr)

    return None