import blaze as bz
import pandas as pd
from odo import odo


def _base_stats(ds):
    # Aggregate price/area totals and row counts per district, then derive
    # per-district averages from the aggregated sums.
    df = odo(bz.by(ds.district,
                   sum_price=bz.sum(ds.price),
                   sum_area=bz.sum(ds.area),
                   count=bz.count(ds.price)),
             pd.DataFrame)
    df["avg_area"] = df["sum_area"] / df["count"]
    df["avg_price"] = df["sum_price"] / df["count"]
    df["avg_price_m2"] = df["sum_price"] / df["sum_area"]
    return df
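# Minimal usage sketch for _base_stats (illustrative, not part of the original
# module): it assumes a small in-memory DataFrame with the `district`, `price`
# and `area` columns the function expects, wrapped with blaze's Data().
_sample = pd.DataFrame({
    'district': ['North', 'North', 'South'],
    'price':    [200000., 250000., 180000.],
    'area':     [50., 60., 45.],
})
print(_base_stats(bz.Data(_sample)))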
def test_count(self, data):
    from blaze import compute, count
    # `t` (the blaze symbol describing the table) and `x` (the raw backing
    # dataset) are assumed to be module-level fixtures in this test module;
    # counting a column through compute() should match the raw length.
    assert compute(count(t['amount']), data) == len(x['amount'])
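# Standalone sketch of what the test above exercises (illustrative; the real
# test relies on its module-level `t` and `x` fixtures): a blaze symbol
# describes the table schema, and compute() evaluates count() against
# concrete data bound to that symbol.
from blaze import symbol, compute, count

t_example = symbol('t', 'var * {amount: int64, name: string}')
x_example = [(100, 'Alice'), (200, 'Bob'), (300, 'Charlie')]
assert compute(count(t_example['amount']), x_example) == len(x_example)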
import blaze as bz
import numpy as np
import pandas as pd
# Masked arrays (used below to ignore NaNs) need the mstats variant of ttest_ind.
from scipy.stats.mstats import ttest_ind


def groupmeans(data, groups, numbers, cutoff=0.01, quantile=0.95, minsize=None):
    """
    Returns the significant differences in average between every pair of
    groups and numbers.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None ignores significance checks, speeding it up a LOT.
    quantile : number that represents target improvement. Defaults to .95.
        The ``gain`` returned is the % impact of everyone moving to the
        95th percentile.
    minsize : each group should contain at least minsize values.
        If minsize=None, automatically set the minimum size to 1% of the
        dataset, or 10, whichever is larger.
    """
    if minsize is None:
        minsize = max(data.nrows / 100, 10)

    means = {col: data[col].mean() for col in numbers}
    results = []
    for group in groups:
        agg = {number: bz.mean(data[number]) for number in numbers}
        agg["#"] = bz.count(data)
        ave = bz.by(data[group], **agg).sort("#", ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave["#"]
        # Each group should contain at least minsize values
        biggies = sizes[sizes >= minsize].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            # cutoff=None skips the (slow) significance test entirely, as
            # documented above; prob is then reported as None.
            prob = None
            if cutoff is not None:
                lo = bz.into(list, data[number][data[group] == sorted_cats.index[0]])
                hi = bz.into(list, data[number][data[group] == sorted_cats.index[-1]])
                _, prob = ttest_ind(
                    np.ma.masked_array(lo, np.isnan(lo)),
                    np.ma.masked_array(hi, np.isnan(hi)),
                )
                if prob > cutoff:
                    continue
            results.append({
                "group": group,
                "number": number,
                "prob": prob,
                "gain": (sorted_cats.iloc[-1] / means[number] - 1)[0],
                "biggies": ave.loc[biggies, number],
                "means": ave[[number, "#"]].sort_values(by=number),
            })

    results = pd.DataFrame(results)
    if len(results) > 0:
        results = results.set_index(["group", "number"])
    return results
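# Usage sketch for groupmeans (illustrative; the column names and values are
# made up): wrap a DataFrame with blaze and ask for significant group/number
# pairs. With this toy data nothing clears the default significance cutoff,
# so an empty DataFrame is the expected output.
_sales = pd.DataFrame({
    'region':  ['N', 'N', 'S', 'S', 'E', 'E'],
    'revenue': [10., 12., 11., 14., 10., 13.],
})
print(groupmeans(bz.Data(_sales), groups=['region'], numbers=['revenue'],
                 minsize=1))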
import itertools

import blaze as bz
import pandas as pd


def crosstabs(data, columns=None, values=None, correction=False,
              pairs_top=10000, details=True):
    '''
    Identifies the strength of the relationship between every pair of
    categorical columns in a DataFrame.

    Parameters
    ----------
    data : Blaze data
        A dataset with at least 2 columns having categorical values.
    columns : list of column names in data
        If not specified, uses ``autolyse.types(data)['groups']`` to identify
        all columns with categorical data.
    values : str, column name
        Optional column that contains weights to aggregate by summing up.
        By default, each row is counted as an observation.
    correction : boolean
        If True, and the degrees of freedom is 1, apply Yates' correction
        for continuity. The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
        Defaults to False since Cramer's V (a more useful metric than
        chi-squared) must be computed without this correction.
    pairs_top : integer
        Consider only the top ``pairs_top`` group pairs by aggregated value
        (10000 by default).
    details : boolean
        If True, also return the observed and expected DataFrames for each
        pair. Defaults to True.
    '''
    # `types` and `_crosstab` are companion helpers from this (autolyse) module.
    if columns is None:
        columns = types(data)['groups']
    parameters = ('p', 'chi2', 'dof', 'V')
    for index, column in itertools.combinations(columns, 2):
        agg_col = values if values in data.fields else column
        agg_func = (bz.count(data[agg_col]) if agg_col == column
                    else bz.sum(data[agg_col]))
        data_grouped = bz.into(pd.DataFrame,
                               bz.by(bz.merge(data[index], data[column]),
                                     values=agg_func)
                               .sort('values')      # Generated SQL inefficient
                               .head(pairs_top))
        # BUG: bz.count is a non-null count, so it gives a 0 count for NULL
        # groups. .nrows needs to be fixed: blaze/issues/1484
        # For now, we'll ignore NULL groups.
        data_grouped = data_grouped.dropna()
        if data_grouped.empty:
            result = {(index, column): {}}
        else:
            r = _crosstab(data_grouped[index],
                          column=data_grouped[column],
                          values=data_grouped['values'],
                          correction=correction)
            if details:
                result = {
                    'index': index,
                    'column': column,
                    'observed': r['observed'].to_json(),
                    'expected': r['expected'].to_json(),
                    'stats': {param: r[param] for param in parameters},
                }
            else:
                result = {
                    'index': index,
                    'column': column,
                    'stats': {param: r[param] for param in parameters},
                }
        yield result
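# Usage sketch for crosstabs (illustrative; it assumes the companion autolyse
# helpers `types` and `_crosstab` are importable in this module, as referenced
# in the docstring, and the sample columns are made up). Each yielded dict
# carries the column pair and its chi-squared / Cramer's V statistics.
_survey = pd.DataFrame({
    'gender':  ['M', 'F', 'M', 'F', 'M', 'F'],
    'segment': ['A', 'A', 'B', 'B', 'A', 'B'],
})
for pair in crosstabs(bz.Data(_survey), columns=['gender', 'segment']):
    print(pair['index'], pair['column'], pair['stats'])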