def _base_stats(ds):
    """Per-district totals (price, area, row count) plus derived averages.

    Returns a pandas DataFrame with columns sum_price, sum_area, count,
    avg_area, avg_price, and avg_price_m2 (price per unit area).
    """
    grouped = bz.by(
        ds.district,
        sum_price=bz.sum(ds.price),
        sum_area=bz.sum(ds.area),
        count=bz.count(ds.price),
    )
    stats = odo(grouped, pd.DataFrame)
    stats["avg_area"] = stats["sum_area"] / stats["count"]
    stats["avg_price"] = stats["sum_price"] / stats["count"]
    stats["avg_price_m2"] = stats["sum_price"] / stats["sum_area"]
    return stats
def test_sum_zerosize(self):
    # Empty sum operations should produce 0, the reduction identity
    def check(expr, expected):
        # Evaluate a blaze expression and compare its plain-Python value.
        self.assertEqual(ddesc_as_py(blaze.eval(expr).ddesc), expected)

    check(blaze.sum([]), 0)
    check(blaze.sum([], keepdims=True), [0])
    check(blaze.sum([[], []]), 0)
    check(blaze.sum([[], []], keepdims=True), [[0]])
    check(blaze.sum([[], []], axis=-1), [0, 0])
    check(blaze.sum([[], []], axis=-1, keepdims=True), [[0], [0]])
    # If we're only reducing on a non-empty dimension, we might still
    # end up with zero-sized outputs
    check(blaze.sum([[], []], axis=0), [])
    check(blaze.sum([[], []], axis=0, keepdims=True), [[]])
def test_operations(datashape):
    """Smoke-test arithmetic and reduction operations on two test arrays.

    Builds two arrays of the given datashape and prints each labeled
    result; no assertions are made.
    """
    a = make_test_array(datashape)
    b = make_test_array(datashape)
    labeled_results = (
        ('a:\n', a),
        ('b:\n', b),
        ('a + b:\n', a + b),
        ('a - b:\n', a - b),
        ('a * b:\n', a * b),
        ('a / b:\n', a / b),
        ('blaze.max(a):\n', blaze.max(a)),
        ('blaze.min(a):\n', blaze.min(a)),
        ('blaze.product(a):\n', blaze.product(a)),
        ('blaze.sum(a):\n', blaze.sum(a)),
    )
    for label, value in labeled_results:
        print(label, value)
def test_sum_zerosize(self):
    # Empty sum operations should produce 0, the reduction identity
    cases = [
        (blaze.sum([]), 0),
        (blaze.sum([], keepdims=True), [0]),
        (blaze.sum([[], []]), 0),
        (blaze.sum([[], []], keepdims=True), [[0]]),
        (blaze.sum([[], []], axis=-1), [0, 0]),
        (blaze.sum([[], []], axis=-1, keepdims=True), [[0], [0]]),
        # If we're only reducing on a non-empty dimension, we might still
        # end up with zero-sized outputs
        (blaze.sum([[], []], axis=0), []),
        (blaze.sum([[], []], axis=0, keepdims=True), [[]]),
    ]
    for expr, expected in cases:
        self.assertEqual(ddesc_as_py(blaze.eval(expr).ddesc), expected)
def test_sum(self, data):
    # The blaze-computed column sum must agree with the reference value
    # obtained directly from the backing data.
    from blaze import sum
    expected = x['amount'].sum()
    actual = compute(sum(t['amount']), data)
    assert actual == expected
def test_sum(self):
    def expect(expr, expected):
        # Evaluate a blaze expression and compare its plain-Python value.
        self.assertEqual(ddesc_as_py(blaze.eval(expr).ddesc), expected)

    # Sum of scalar case is the element itself
    expect(blaze.sum(10), 10)
    expect(blaze.sum(-5.0), -5.0)
    # One-dimensional size one
    expect(blaze.sum([10]), 10)
    expect(blaze.sum([-5.0]), -5.0)
    expect(blaze.sum([-5.0], axis=0), -5.0)
    expect(blaze.sum([10], keepdims=True), [10])
    # One dimensional
    expect(blaze.sum([1, 2]), 3)
    expect(blaze.sum([0, 1, 2]), 3)
    # Two dimensional
    expect(blaze.sum([[1, 2, 3], [4, 5, 6]]), 21)
    # Two dimensional, with axis= argument both positive and negative
    expect(blaze.sum([[1, 5, 3], [4, 2, 6]], axis=0), [5, 7, 9])
    expect(blaze.sum([[1, 5, 3], [4, 2, 6]], axis=-2), [5, 7, 9])
    expect(blaze.sum([[1, 2, 3], [4, 5, 6]], axis=1), [6, 15])
    expect(blaze.sum([[1, 2, 3], [4, 5, 6]], axis=-1), [6, 15])
    # Two dimensional, with keepdims=True
    expect(blaze.sum([[1, 2, 3], [4, 5, 6]], keepdims=True), [[21]])
    expect(blaze.sum([[1, 2, 3], [5, 4, 6]], axis=0, keepdims=True),
           [[6, 6, 9]])
    expect(blaze.sum([[1, 5, 3], [4, 2, 6]], axis=1, keepdims=True),
           [[9], [12]])
def test_sum(self):
    cases = [
        # Sum of scalar case is the element itself
        (blaze.sum(10), 10),
        (blaze.sum(-5.0), -5.0),
        # One-dimensional size one
        (blaze.sum([10]), 10),
        (blaze.sum([-5.0]), -5.0),
        (blaze.sum([-5.0], axis=0), -5.0),
        (blaze.sum([10], keepdims=True), [10]),
        # One dimensional
        (blaze.sum([1, 2]), 3),
        (blaze.sum([0, 1, 2]), 3),
        # Two dimensional
        (blaze.sum([[1, 2, 3], [4, 5, 6]]), 21),
        # Two dimensional, with axis= argument both positive and negative
        (blaze.sum([[1, 5, 3], [4, 2, 6]], axis=0), [5, 7, 9]),
        (blaze.sum([[1, 5, 3], [4, 2, 6]], axis=-2), [5, 7, 9]),
        (blaze.sum([[1, 2, 3], [4, 5, 6]], axis=1), [6, 15]),
        (blaze.sum([[1, 2, 3], [4, 5, 6]], axis=-1), [6, 15]),
        # Two dimensional, with keepdims=True
        (blaze.sum([[1, 2, 3], [4, 5, 6]], keepdims=True), [[21]]),
        (blaze.sum([[1, 2, 3], [5, 4, 6]], axis=0, keepdims=True),
         [[6, 6, 9]]),
        (blaze.sum([[1, 5, 3], [4, 2, 6]], axis=1, keepdims=True),
         [[9], [12]]),
    ]
    for expr, expected in cases:
        self.assertEqual(ddesc_as_py(blaze.eval(expr).ddesc), expected)
def crosstabs(data, columns=None, values=None, correction=False,
              pairs_top=10000, details=True):
    '''
    Identifies the strength of relationship between every pair of
    categorical columns in a DataFrame.

    Parameters
    ----------
    data : Blaze data
        A data with at least 2 columns having categorical values.
    columns : list of column names in data
        If not specified, uses ``autolyse.types(data)['groups']`` to
        identify all columns with categorical data.
    values : str, column name
        Optional column that contains weights to aggregate by summing up.
        By default, each row is counted as an observation.
    correction : boolean
        If True, and the degrees of freedom is 1, apply Yates' correction
        for continuity. The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
        Defaults to False since Cramer's V (a more useful metric than
        chi-squared) must be computed without this correction.
    pairs_top : integer
        Pick only top 10000 pairs by default.
    details : boolean
        If True, will return observed and expected dataframes for pairs.
        Defaults to True.

    Yields
    ------
    dict
        One dict per column pair with keys ``'index'``, ``'column'`` and
        ``'stats'`` (``'stats'`` is empty when the pair has no non-null
        groups). When ``details`` is True and stats exist, also includes
        ``'observed'`` and ``'expected'`` as JSON strings.
    '''
    if columns is None:
        columns = types(data)['groups']
    parameters = ('p', 'chi2', 'dof', 'V')
    for index, column in itertools.combinations(columns, 2):
        # Sum the weights column when one was given; otherwise count rows.
        agg_col = values if values in data.fields else column
        agg_func = (bz.count(data[agg_col]) if agg_col == column
                    else bz.sum(data[agg_col]))
        data_grouped = bz.into(
            pd.DataFrame,
            bz.by(bz.merge(data[index], data[column]), values=agg_func)
              .sort('values')          # Generated SQL inefficient
              .head(pairs_top))
        # BUG: bz.count: non-null count, gives 0 count for NULL groups
        # .nrows needs to fixed blaze/issues/1484
        # For now, we'll ignore NULL groups
        # Remove NULL groups
        data_grouped = data_grouped.dropna()
        if data_grouped.empty:
            # FIX: previously yielded {(index, column): {}} — a dict keyed
            # by the column-pair tuple — which broke the uniform
            # {'index', 'column', 'stats'} schema every other branch
            # produces. Yield the same shape with empty stats instead.
            result = {'index': index, 'column': column, 'stats': {}}
        else:
            r = _crosstab(data_grouped[index],
                          column=data_grouped[column],
                          values=data_grouped['values'],
                          correction=correction)
            stats = {param: r[param] for param in parameters}
            if details:
                result = {
                    'index': index,
                    'column': column,
                    'observed': r['observed'].to_json(),
                    'expected': r['expected'].to_json(),
                    'stats': stats,
                }
            else:
                result = {
                    'index': index,
                    'column': column,
                    'stats': stats,
                }
        yield result