Example 1
import blaze as bz
import pandas as pd
from odo import odo


def _base_stats(ds):
    # Group the Blaze expression by district, aggregate price and area,
    # then materialize the result as a pandas DataFrame via odo.
    df = odo(bz.by(ds.district,
            sum_price=bz.sum(ds.price),
            sum_area=bz.sum(ds.area),
            count=bz.count(ds.price)),
        pd.DataFrame)

    # Derive per-district averages from the aggregated sums.
    df["avg_area"] = df["sum_area"] / df["count"]
    df["avg_price"] = df["sum_price"] / df["count"]
    df["avg_price_m2"] = df["sum_price"] / df["sum_area"]

    return df
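A minimal usage sketch, assuming a Blaze data source with district, price and area columns (the file name below is hypothetical):

import blaze as bz

ds = bz.data('listings.csv')   # hypothetical file with district, price, area
stats = _base_stats(ds)
print(stats.head())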
Example 2
def test_count(self, data):
    # `t` (a Blaze symbol) and `x` (the raw backing data) are assumed to be
    # fixtures defined in the surrounding test module.
    from blaze import count
    assert compute(count(t['amount']), data) == len(x['amount'])
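The same check can be reproduced standalone; a minimal sketch, assuming Blaze's pure-Python list backend (the symbol and rows below are illustrative):

from blaze import symbol, compute, count

t = symbol('t', 'var * {name: string, amount: int}')
rows = [('Alice', 100), ('Bob', 200), ('Alice', 50)]
assert compute(count(t['amount']), rows) == 3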
Example 4
import blaze as bz
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind


def groupmeans(data, groups, numbers, cutoff=0.01, quantile=0.95, minsize=None):
    """
    Yields the significant differences in average between every pair of
    groups and numbers.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None ignores significance checks, speeding it up a LOT.
    quantile : number that represents target improvement. Defaults to .95.
        The ``diff`` returned is the % impact of everyone moving to the 95th
        percentile
    minsize : each group should contain at least minsize values.
        If minsize=None, automatically set the minimum size to
        1% of the dataset, or 10, whichever is larger.
    """

    if minsize is None:
        minsize = max(data.nrows // 100, 10)

    means = {col: data[col].mean() for col in numbers}
    results = []

    for group in groups:
        # Aggregate the mean of every numeric column (plus a row count)
        # for each level of the group column, largest groups first.
        agg = {number: bz.mean(data[number]) for number in numbers}
        agg["#"] = bz.count(data)
        ave = bz.by(data[group], **agg).sort("#", ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave["#"]

        # Each group should contain at least minsize values
        biggies = sizes[sizes >= minsize].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            lo = bz.into(list, data[number][data[group] == sorted_cats.index[0]])
            hi = bz.into(list, data[number][data[group] == sorted_cats.index[-1]])
            # t-test on the lowest vs. highest group, masking NaNs so they
            # do not poison the statistic.
            _, prob = ttest_ind(np.ma.masked_array(lo, np.isnan(lo)),
                                np.ma.masked_array(hi, np.isnan(hi)))
            if cutoff is not None and prob > cutoff:
                continue
            results.append(
                {
                    "group": group,
                    "number": number,
                    "prob": prob,
                    "gain": (sorted_cats.iloc[-1] / means[number] - 1)[0],
                    "biggies": ave.ix[biggies][number],
                    "means": ave[[number, "#"]].sort_values(by=number),
                }
            )

    results = pd.DataFrame(results)
    if len(results) > 0:
        results = results.set_index(["group", "number"])
    return results
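A minimal usage sketch, assuming a Blaze data source with a categorical `city` column and a numeric `sales` column (the file name is hypothetical):

import blaze as bz

data = bz.data('sales.csv')
diffs = groupmeans(data, groups=['city'], numbers=['sales'])
if len(diffs):
    print(diffs[['prob', 'gain']])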
Example 5
import itertools

import blaze as bz
import pandas as pd

# ``types`` and ``_crosstab`` are helpers defined elsewhere in this module.


def crosstabs(data, columns=None, values=None,
              correction=False,
              pairs_top=10000,
              details=True):
    '''
    Yields the strength of the relationship between every pair of categorical
    columns in the data.

    Parameters
    ----------
    data : Blaze data
        A Blaze data object with at least 2 columns holding categorical
        values.
    columns : list of column names in data
        If not specified, uses ``autolyse.types(data)['groups']`` to identify
        all columns with categorical data.
    values : str, column name
        Optional column that contains weights to aggregate by summing up. By
        default, each row is counted as one observation.
    correction : boolean
        If True, and the degrees of freedom is 1, apply Yates' correction for
        continuity. The effect of the correction is to adjust each observed
        value by 0.5 towards the corresponding expected value. Defaults to
        False, since Cramer's V (a more useful metric than chi-squared) must
        be computed without this correction.
    pairs_top : integer
        Consider only the top ``pairs_top`` pairs by aggregated value.
        Defaults to 10000.
    details : boolean
        If True, also yield the observed and expected DataFrames for each
        pair. Defaults to True.
    '''
    if columns is None:
        columns = types(data)['groups']

    parameters = ('p', 'chi2', 'dof', 'V')
    for index, column in itertools.combinations(columns, 2):
        # Sum the weights column if one was given; otherwise count rows.
        agg_col = values if values in data.fields else column
        agg_func = (bz.count(data[agg_col]) if agg_col == column
                    else bz.sum(data[agg_col]))
        data_grouped = bz.into(pd.DataFrame,
                               bz.by(bz.merge(data[index], data[column]),
                                     values=agg_func)
                               .sort('values')  # Generated SQL inefficient
                               .head(pairs_top))
        # BUG: bz.count is a non-null count, so NULL groups get a 0 count,
        # and .nrows needs fixing (blaze issue #1484).
        # For now, drop NULL groups entirely.
        data_grouped = data_grouped.dropna()
        if data_grouped.empty:
            # Keep the result shape consistent with the non-empty branch.
            result = {'index': index, 'column': column, 'stats': {}}
        else:
            r = _crosstab(data_grouped[index],
                          column=data_grouped[column],
                          values=data_grouped['values'],
                          correction=correction)
            if details:
                result = {
                    'index': index,
                    'column': column,
                    'observed': r['observed'].to_json(),
                    'expected': r['expected'].to_json(),
                    'stats': {param: r[param] for param in parameters},
                }
            else:
                result = {
                    'index': index,
                    'column': column,
                    'stats': {param: r[param] for param in parameters},
                }

        yield result
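A minimal usage sketch; the file and column names below are hypothetical:

import blaze as bz

data = bz.data('survey.csv')
for pair in crosstabs(data, columns=['gender', 'region', 'segment']):
    print(pair['index'], pair['column'], pair['stats'])

``_crosstab`` itself is not shown in this example. A plausible sketch of what it reports, using scipy's chi-square test and the standard Cramer's V formula V = sqrt(chi2 / (n * (min(rows, cols) - 1))) — the helper name and exact return shape here are assumptions:

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def _crosstab_sketch(index, column, values, correction=False):
    # Pivot the grouped sums into an observed contingency table.
    observed = pd.crosstab(index, column, values=values,
                           aggfunc='sum').fillna(0)
    chi2, p, dof, expected = chi2_contingency(observed, correction=correction)
    n = observed.values.sum()
    V = np.sqrt(chi2 / (n * (min(observed.shape) - 1)))
    return {'observed': observed, 'expected': pd.DataFrame(expected),
            'chi2': chi2, 'p': p, 'dof': dof, 'V': V}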