Example #1
0
def _base_stats(ds):
    """Group *ds* by district and return a DataFrame of totals and averages.

    The aggregated columns are sum_price, sum_area and count (non-null
    price count); avg_area, avg_price and avg_price_m2 are derived from
    those totals.
    """
    # Aggregate once on the blaze side, then materialize as a DataFrame.
    grouped = bz.by(ds.district,
                    sum_price=bz.sum(ds.price),
                    sum_area=bz.sum(ds.area),
                    count=bz.count(ds.price))
    frame = odo(grouped, pd.DataFrame)

    counts = frame["count"]
    frame["avg_area"] = frame["sum_area"] / counts
    frame["avg_price"] = frame["sum_price"] / counts
    frame["avg_price_m2"] = frame["sum_price"] / frame["sum_area"]

    return frame
 def test_sum_zerosize(self):
     """Summing zero-sized arrays yields 0, the additive identity."""
     # (operand, blaze.sum keyword arguments, expected python value)
     cases = [
         ([], {}, 0),
         ([], {'keepdims': True}, [0]),
         ([[], []], {}, 0),
         ([[], []], {'keepdims': True}, [[0]]),
         ([[], []], {'axis': -1}, [0, 0]),
         ([[], []], {'axis': -1, 'keepdims': True}, [[0], [0]]),
         # Reducing only along a non-empty dimension can still leave
         # zero-sized outputs.
         ([[], []], {'axis': 0}, []),
         ([[], []], {'axis': 0, 'keepdims': True}, [[]]),
     ]
     for operand, kwargs, expected in cases:
         self.assertEqual(
             ddesc_as_py(blaze.eval(blaze.sum(operand, **kwargs)).ddesc),
             expected)
Example #3
0
def test_operations(datashape):
    """Smoke-test elementwise arithmetic and reductions on two arrays.

    Builds two test arrays of the given datashape and prints each
    operation's result; evaluation stays interleaved with printing so a
    failure is attributable to the operation just printed.
    """
    a = make_test_array(datashape)
    b = make_test_array(datashape)
    print('a:\n', a)
    print('b:\n', b)
    # Thunks keep each expression lazily evaluated in print order.
    operations = (
        ('a + b:\n', lambda: a + b),
        ('a - b:\n', lambda: a - b),
        ('a * b:\n', lambda: a * b),
        ('a / b:\n', lambda: a / b),
        ('blaze.max(a):\n', lambda: blaze.max(a)),
        ('blaze.min(a):\n', lambda: blaze.min(a)),
        ('blaze.product(a):\n', lambda: blaze.product(a)),
        ('blaze.sum(a):\n', lambda: blaze.sum(a)),
    )
    for label, evaluate in operations:
        print(label, evaluate())
 def test_sum_zerosize(self):
     """Summing zero-sized arrays produces 0, the reduction identity."""
     # Empty sum operations should produce 0, the reduction identity
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum([])).ddesc), 0)
     self.assertEqual(
         ddesc_as_py(blaze.eval(blaze.sum([], keepdims=True)).ddesc), [0])
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum([[], []])).ddesc), 0)
     self.assertEqual(
         ddesc_as_py(blaze.eval(blaze.sum([[], []], keepdims=True)).ddesc),
         [[0]])
     self.assertEqual(
         ddesc_as_py(blaze.eval(blaze.sum([[], []], axis=-1)).ddesc),
         [0, 0])
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(blaze.sum([[], []], axis=-1, keepdims=True)).ddesc),
         [[0], [0]])
     # If we're only reducing on a non-empty dimension, we might still
     # end up with zero-sized outputs
     self.assertEqual(
         ddesc_as_py(blaze.eval(blaze.sum([[], []], axis=0)).ddesc), [])
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(blaze.sum([[], []], axis=0, keepdims=True)).ddesc),
         [[]])
 def test_sum(self, data):
     """blaze sum of the 'amount' column matches the reference total."""
     from blaze import sum as blaze_sum
     expected = x['amount'].sum()
     assert compute(blaze_sum(t['amount']), data) == expected
 def test_sum(self, data):
     """blaze sum over t['amount'] computed on *data* equals x['amount'].sum().

     NOTE(review): `t` and `x` are fixtures defined outside this view —
     presumably a symbolic table and its backing array; confirm in the
     enclosing test module.
     """
     from blaze import sum
     assert compute(sum(t['amount']), data) == x['amount'].sum()
 def test_sum(self):
     """blaze.sum over scalars, 1-D and 2-D inputs, with axis/keepdims."""
     # (operand, blaze.sum keyword arguments, expected python value)
     cases = [
         # Sum of a scalar is the element itself.
         (10, {}, 10),
         (-5.0, {}, -5.0),
         # One-dimensional, size one.
         ([10], {}, 10),
         ([-5.0], {}, -5.0),
         ([-5.0], {'axis': 0}, -5.0),
         ([10], {'keepdims': True}, [10]),
         # One-dimensional.
         ([1, 2], {}, 3),
         ([0, 1, 2], {}, 3),
         # Two-dimensional.
         ([[1, 2, 3], [4, 5, 6]], {}, 21),
         # axis= argument, both positive and negative.
         ([[1, 5, 3], [4, 2, 6]], {'axis': 0}, [5, 7, 9]),
         ([[1, 5, 3], [4, 2, 6]], {'axis': -2}, [5, 7, 9]),
         ([[1, 2, 3], [4, 5, 6]], {'axis': 1}, [6, 15]),
         ([[1, 2, 3], [4, 5, 6]], {'axis': -1}, [6, 15]),
         # keepdims=True keeps the reduced dimensions as size one.
         ([[1, 2, 3], [4, 5, 6]], {'keepdims': True}, [[21]]),
         ([[1, 2, 3], [5, 4, 6]], {'axis': 0, 'keepdims': True},
          [[6, 6, 9]]),
         ([[1, 5, 3], [4, 2, 6]], {'axis': 1, 'keepdims': True},
          [[9], [12]]),
     ]
     for operand, kwargs, expected in cases:
         self.assertEqual(
             ddesc_as_py(blaze.eval(blaze.sum(operand, **kwargs)).ddesc),
             expected)
 def test_sum(self):
     """blaze.sum over scalars, 1-D and 2-D inputs, with axis/keepdims."""
     # Sum of scalar case is the element itself
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum(10)).ddesc), 10)
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum(-5.0)).ddesc), -5.0)
     # One-dimensional size one
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum([10])).ddesc), 10)
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum([-5.0])).ddesc),
                      -5.0)
     self.assertEqual(
         ddesc_as_py(blaze.eval(blaze.sum([-5.0], axis=0)).ddesc), -5.0)
     self.assertEqual(
         ddesc_as_py(blaze.eval(blaze.sum([10], keepdims=True)).ddesc),
         [10])
     # One dimensional
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum([1, 2])).ddesc), 3)
     self.assertEqual(ddesc_as_py(blaze.eval(blaze.sum([0, 1, 2])).ddesc),
                      3)
     # Two dimensional
     self.assertEqual(
         ddesc_as_py(blaze.eval(blaze.sum([[1, 2, 3], [4, 5, 6]])).ddesc),
         21)
     # Two dimensional, with axis= argument both positive and negative
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(blaze.sum([[1, 5, 3], [4, 2, 6]], axis=0)).ddesc),
         [5, 7, 9])
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(blaze.sum([[1, 5, 3], [4, 2, 6]], axis=-2)).ddesc),
         [5, 7, 9])
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(blaze.sum([[1, 2, 3], [4, 5, 6]], axis=1)).ddesc),
         [6, 15])
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(blaze.sum([[1, 2, 3], [4, 5, 6]], axis=-1)).ddesc),
         [6, 15])
     # Two dimensional, with keepdims=True
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(blaze.sum([[1, 2, 3], [4, 5, 6]],
                                  keepdims=True)).ddesc), [[21]])
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(
                 blaze.sum([[1, 2, 3], [5, 4, 6]], axis=0,
                           keepdims=True)).ddesc), [[6, 6, 9]])
     self.assertEqual(
         ddesc_as_py(
             blaze.eval(
                 blaze.sum([[1, 5, 3], [4, 2, 6]], axis=1,
                           keepdims=True)).ddesc), [[9], [12]])
Example #9
0
def crosstabs(data, columns=None, values=None,
              correction=False,
              pairs_top=10000,
              details=True):
    '''
    Identifies the strength of relationship between every pair of categorical
    columns in a DataFrame.

    Parameters
    ----------
    data : Blaze data
        A data source with at least 2 columns having categorical values.
    columns : list of column names in data, optional
        If not specified, uses ``autolyse.types(data)['groups']`` to identify
        all columns with categorical data.
    values : str, column name, optional
        Optional column that contains weights to aggregate by summing up. By
        default, each row is counted as an observation.
    correction : boolean
        If True, and the degrees of freedom is 1, apply Yates' correction for
        continuity. The effect of the correction is to adjust each observed
        value by 0.5 towards the corresponding expected value. Defaults to
        False since Cramer's V (a more useful metric than chi-squared) must be
        computed without this correction.
    pairs_top : integer
        Keep only the top ``pairs_top`` groups per pair, sorted by the
        aggregated value. Defaults to 10000.
    details : boolean
        If True, also include the observed and expected crosstab frames
        (serialized to JSON) in each result. Defaults to True.

    Yields
    ------
    dict
        Per-pair results with keys 'index', 'column' and 'stats' (and
        'observed'/'expected' when ``details`` is True).
    '''
    if columns is None:
        columns = types(data)['groups']

    parameters = ('p', 'chi2', 'dof', 'V')
    for index, column in itertools.combinations(columns, 2):
        # Sum the weights column when one was given; otherwise count rows.
        agg_col = values if values in data.fields else column
        agg_func = bz.count(data[agg_col]) if agg_col == column else bz.sum(data[agg_col])
        data_grouped = bz.into(pd.DataFrame,
                               bz.by(bz.merge(data[index], data[column]),
                                     values=agg_func)
                               .sort('values')  # Generated SQL inefficient
                               .head(pairs_top))
        # BUG: bz.count: non-null count, gives 0 count for NULL groups
        # .nrows needs to fixed blaze/issues/1484
        # For now, we'll ignore NULL groups
        # Remove NULL groups
        data_grouped = data_grouped.dropna()
        if data_grouped.empty:
            # NOTE(review): this empty-pair result {(index, column): {}} has a
            # different shape than the keyed dicts yielded below — confirm
            # consumers handle both shapes.
            result = {(index, column): {}}
        else:
            r = _crosstab(data_grouped[index],
                          column=data_grouped[column],
                          values=data_grouped['values'],
                          correction=correction)
            if details:
                result = {
                        'index': index,
                        'column': column,
                        'observed': r['observed'].to_json(),
                        'expected': r['expected'].to_json(),
                        'stats': {param: r[param] for param in parameters}
                }
            else:
                result = {
                        'index': index,
                        'column': column,
                        'stats': {param: r[param] for param in parameters}
                }

        yield result
Example #10
0
def crosstabs(data, columns=None, values=None,
              correction=False,
              pairs_top=10000,
              details=True):
    '''
    Yield the strength of relationship between every pair of categorical
    columns in a DataFrame.

    Parameters
    ----------
    data : Blaze data
        A data source with at least 2 columns having categorical values.
    columns : list of column names in data, optional
        If not specified, uses ``autolyse.types(data)['groups']`` to identify
        all columns with categorical data.
    values : str, column name, optional
        Optional column of weights aggregated by summing. By default each
        row is counted as one observation.
    correction : boolean
        If True, and the degrees of freedom is 1, apply Yates' correction
        for continuity (each observed value adjusted by 0.5 towards its
        expected value). Defaults to False since Cramer's V must be computed
        without this correction.
    pairs_top : integer
        Keep only the top ``pairs_top`` groups per pair, sorted by the
        aggregated value. Defaults to 10000.
    details : boolean
        If True, also include the observed and expected crosstab frames
        (serialized to JSON) in each result. Defaults to True.

    Yields
    ------
    dict
        Per-pair results with keys 'index', 'column' and 'stats' (and
        'observed'/'expected' when ``details`` is True); an empty group
        yields ``{(index, column): {}}`` instead.
    '''
    if columns is None:
        columns = types(data)['groups']

    parameters = ('p', 'chi2', 'dof', 'V')
    for index, column in itertools.combinations(columns, 2):
        # Sum the weights column when one was given; otherwise count rows.
        if values in data.fields:
            agg_col = values
        else:
            agg_col = column
        if agg_col == column:
            agg_func = bz.count(data[agg_col])
        else:
            agg_func = bz.sum(data[agg_col])
        expr = bz.by(bz.merge(data[index], data[column]), values=agg_func)
        expr = expr.sort('values').head(pairs_top)  # Generated SQL inefficient
        grouped = bz.into(pd.DataFrame, expr)
        # BUG: bz.count is a non-null count and gives 0 for NULL groups
        # (.nrows needs blaze/issues/1484 fixed), so ignore NULL groups.
        grouped = grouped.dropna()

        if grouped.empty:
            yield {(index, column): {}}
            continue

        r = _crosstab(grouped[index],
                      column=grouped[column],
                      values=grouped['values'],
                      correction=correction)
        result = {
                'index': index,
                'column': column,
                'stats': {param: r[param] for param in parameters}
        }
        if details:
            result['observed'] = r['observed'].to_json()
            result['expected'] = r['expected'].to_json()

        yield result