Exemple #1
0
def test_simple():
    """Smoke-test table creation/opening, reductions and selections.

    If ``./noaa_data`` does not exist, a new table is created with that
    storage location, filled from ``adapter`` and committed; otherwise the
    existing table is opened. The mean of ``f3`` and the standard deviation
    of ``f2`` are then printed, and two selection queries are evaluated.
    The function makes no assertions; it only checks that the calls run.
    """
    if not os.path.exists('./noaa_data'):
        # First run: build the table under ./noaa_data.
        p = params(clevel=5, storage='./noaa_data')

        t = Table([], dshape='{f0: int, f1:int, f2:int, f3:float}', params=p)

        # TODO: chunkwise copy
        # NOTE(review): `adapter` is not defined in this function; presumably
        # a module-level data source -- confirm.
        t.append(adapter[:])
        t.commit()
    else:
        # NOTE(review): called with a ``ctable://`` URI, so ``open`` is
        # presumably the library's opener shadowing the builtin -- confirm.
        t = open('ctable://noaa_data')

    # Single-argument print calls emit the same text on Python 2 and 3,
    # unlike the print statement (a SyntaxError on Python 3).
    rule = '--------------------------------------'
    print(rule)
    print('mean %s' % (mean(t, 'f3'),))
    print('std %s' % (std(t, 'f2'),))
    print(rule)

    # Single-column and two-column selections; the results are not checked.
    qs1 = select(t, lambda x: x > 80000, 'f0')
    qs2 = select2(t, lambda x, y: x > y, ['f0', 'f1'])

    result = t[qs1]
Exemple #2
0
def test_simple():
    """Smoke-test table creation/opening, reductions and selections.

    If ``./noaa_data`` does not exist, a new table is created with that
    storage location, filled from ``adapter`` and committed; otherwise the
    existing table is opened. The mean of ``f3`` and the standard deviation
    of ``f2`` are then printed, and two selection queries are evaluated.
    The function makes no assertions; it only checks that the calls run.
    """
    if not os.path.exists('./noaa_data'):
        # First run: build the table under ./noaa_data.
        p = params(clevel=5, storage='./noaa_data')

        t = Table([], dshape='{f0: int, f1:int, f2:int, f3:float}', params=p)

        # TODO: chunkwise copy
        # NOTE(review): `adapter` is not defined in this function; presumably
        # a module-level data source -- confirm.
        t.append(adapter[:])
        t.commit()
    else:
        # NOTE(review): called with a ``ctable://`` URI, so ``open`` is
        # presumably the library's opener shadowing the builtin -- confirm.
        t = open('ctable://noaa_data')

    # Single-argument print calls emit the same text on Python 2 and 3,
    # unlike the print statement (a SyntaxError on Python 3).
    separator = '--------------------------------------'
    print(separator)
    print('mean %s' % (mean(t, 'f3'),))
    print('std %s' % (std(t, 'f2'),))
    print(separator)

    # Single-column and two-column selections; the results are not checked.
    qs1 = select(t, lambda x: x > 80000, 'f0')
    qs2 = select2(t, lambda x, y: x > y, ['f0', 'f1'])

    result = t[qs1]
 def test_mean(self, data):
     """Check the computed mean of ``amount`` against the reference value.

     Evaluates ``mean(t['amount'])`` on ``data`` and compares it with
     ``x['amount'].mean()``.
     """
     # NOTE(review): `t`, `x` and `compute` come from an enclosing scope that
     # is not visible here -- presumably a symbolic table and a reference
     # frame; confirm.
     from blaze import mean
     computed = compute(mean(t['amount']), data)
     reference = x['amount'].mean()
     assert computed == reference
 def test_mean(self, data):
     """Check the computed mean of ``amount`` against the reference value.

     Evaluates ``mean(t['amount'])`` on ``data`` and compares it with
     ``x['amount'].mean()``.
     """
     # NOTE(review): `t`, `x` and `compute` come from an enclosing scope that
     # is not visible here -- presumably a symbolic table and a reference
     # frame; confirm.
     from blaze import mean
     actual = compute(mean(t['amount']), data)
     expected = x['amount'].mean()
     assert actual == expected
Exemple #5
0
def groupmeans(data, groups, numbers, cutoff=0.01, quantile=0.95, minsize=None):
    """
    Return the significant differences in average for every pair of
    group column and numeric column.

    For each (group, number) pair, the categories of ``group`` that are large
    enough are ranked by their mean of ``number``; the lowest- and
    highest-ranked categories are then compared with ``ttest_ind``.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None skips the significance test entirely (speeding it up a
        LOT); every eligible pair is then returned with ``prob`` set to NaN.
    quantile : accepted for backward compatibility; the current
        implementation does not use it.
    minsize : each group should contain at least minsize values.
        If minsize=None, automatically set the minimum size to
        1% of the dataset, or 10, whichever is larger.

    Returns
    -------
    pandas.DataFrame
        One row per retained pair, indexed by ``["group", "number"]``, with
        columns ``prob``, ``gain`` (highest category mean relative to the
        overall mean, minus 1), ``biggies`` (means of the large categories)
        and ``means`` (per-category mean and count, sorted by mean).
        An empty, un-indexed DataFrame is returned when nothing qualifies.
    """

    if minsize is None:
        # Materialise the row count explicitly instead of relying on implicit
        # coercion of a blaze expression inside max().
        minsize = max(bz.into(int, data.nrows) / 100, 10)

    # Overall mean of each numeric column, as plain floats.
    means = {col: bz.into(float, data[col].mean()) for col in numbers}
    # The per-column mean expressions do not depend on the group: build once.
    mean_exprs = {number: bz.mean(data[number]) for number in numbers}
    results = []

    for group in groups:
        agg = dict(mean_exprs)
        agg["#"] = bz.count(data)
        ave = bz.by(data[group], **agg).sort("#", ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave["#"]

        # Each group should contain at least minsize values
        biggies = sizes[sizes >= minsize].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue

            if cutoff is None:
                # Significance check disabled: skip the expensive t-test.
                prob = float("nan")
            else:
                solo, sohi = sorted_cats.index[0], sorted_cats.index[-1]
                # Blaze comparisons need native Python values, not numpy
                # scalars. See https://github.com/blaze/blaze/issues/1461
                if isinstance(solo, np.generic):
                    solo, sohi = solo.item(), sohi.item()
                lo = bz.into(list, data[number][data[group] == solo])
                hi = bz.into(list, data[number][data[group] == sohi])
                # Mask NaNs before comparing the two samples.
                _, prob = ttest_ind(
                    np.ma.masked_array(lo, np.isnan(lo)),
                    np.ma.masked_array(hi, np.isnan(hi)),
                )
                if prob > cutoff:
                    continue

            results.append(
                {
                    "group": group,
                    "number": number,
                    "prob": prob,
                    "gain": sorted_cats.iloc[-1] / means[number] - 1,
                    # .loc replaces the removed DataFrame.ix indexer.
                    "biggies": ave.loc[biggies, number],
                    "means": ave[[number, "#"]].sort_values(by=number),
                }
            )

    results = pd.DataFrame(results)
    if len(results) > 0:
        results = results.set_index(["group", "number"])
    return results
def blaze_tutorial():
    """Run a sequence of Blaze examples, printing each result.

    The walk-through compares plain pandas, ``bl.DataFrame`` and ``bl.Data``
    for the same operations (filtering, projection, grouping, mapping,
    renaming, de-duplication, reductions and type inspection).

    It reads ``examples/iris.csv`` and the ``iris`` table of
    ``examples/iris.db``, writes ``output.csv`` in the current directory and
    removes it at the end.

    WARNING: the final cleanup also deletes every ``*.csv.gz`` file in the
    current working directory.
    """
    # --- Symbolic expression computed against different backends ----------
    accounts = bl.Symbol('accounts',
                         'var * {id: int, name: string, amount: int}')
    deadbeats = accounts[accounts.amount < 0].name

    list_ = [[1, 'Alice', 100],
             [2, 'Bob', -200],
             [3, 'Charlie', 300],
             [4, 'Denis', 400],
             [5, 'Edith', -500]]

    print(list(bl.compute(deadbeats, list_)))

    df_ = bl.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(bl.compute(deadbeats, df_))
    bl_df_dir = dir(df_)

    df_ = pd.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(df_[df_.amount < 0].name)
    pd_df_dir = dir(df_)

    # Compare the attribute sets of bl.DataFrame and pd.DataFrame.
    print(len(bl_df_dir), len(pd_df_dir))
    print(len([d for d in bl_df_dir if d in pd_df_dir]))
    print([d for d in bl_df_dir if d not in pd_df_dir])
    print([d for d in pd_df_dir if d not in bl_df_dir])

    # --- Interactive bl.Data objects --------------------------------------
    df_ = bl.Data([(1, 'Alice', 100),
                   (2, 'Bob', -200),
                   (3, 'Charlie', 300),
                   (4, 'Denis', 400),
                   (5, 'Edith', -500)],
                  fields=['id', 'name', 'balance'])

    print(repr(df_))
    print(repr(df_[df_.balance < 0]))

    print(repr(df_[df_.balance < 0].name))
    print(list(df_[df_.balance < 0].name))

    # --- Iris data from CSV and SQLite ------------------------------------
    iris = bl.Data('examples/iris.csv')
    print(repr(iris))

    iris = bl.Data('sqlite:///examples/iris.db::iris')
    print(repr(iris))

    print(repr(bl.by(iris.species, min=iris.petal_width.min(),
                     max=iris.petal_width.max())))

    result = bl.by(iris.species, min=iris.petal_width.min(),
                   max=iris.petal_width.max())

    print(odo(result, bl.DataFrame))
    print(odo(result, pd.DataFrame))

    # odo has weird issue with unicode filenames, apparently...
    name = 'output.csv'
    print(odo(result, bl.CSV(name)))

    print(repr(iris.sepal_length.mean()))
    print(repr(bl.mean(iris.sepal_length)))

    print(repr(bl.by(iris.species, shortest=iris.petal_length.min(),
                     longest=iris.petal_length.max(),
                     average=iris.petal_length.mean())))

    print(repr(iris.head()))

    iris = bl.transform(iris, sepal_ratio=iris.sepal_length / iris.sepal_width,
                        petal_ratio=iris.petal_length / iris.petal_width)
    print(repr(iris.head()))

    versicolor = iris[iris.species.like('%versicolor')]
    print(repr(versicolor))

    print((len(versicolor), len(versicolor.fields)))

    print(repr(iris.relabel(petal_length='PETAL-LENGTH',
                            petal_width='PETAL-WIDTH')))

    # --- Side-by-side: pandas vs bl.DataFrame vs bl.Data ------------------
    pd_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Joe', 'Bob'],
                          'amount': [100, 200, 300, 400],
                          'id': [1, 2, 3, 4]})

    # wrap the pandas DataFrame as a bl.DataFrame and as a Blaze Data object
    bl_df = bl.DataFrame(pd_df)
    bl_dt = bl.Data(pd_df)

    # Column arithmetic
    print(repr(pd_df.amount * 2))
    print(repr(bl_df.amount * 2))
    print(repr(bl_dt.amount * 2))

    # Projection
    print(repr(pd_df[['id', 'amount']]))
    print(repr(bl_df[['id', 'amount']]))
    print(repr(bl_dt[['id', 'amount']]))

    # Filtering
    print(repr(pd_df[pd_df.amount > 300]))
    print(repr(bl_df[bl_df.amount > 300]))
    print(repr(bl_dt[bl_dt.amount > 300]))

    # Grouping
    print(repr(pd_df.groupby('name').amount.mean()))
    print(repr(pd_df.groupby(['name', 'id']).amount.mean()))

    print(repr(bl_df.groupby('name').amount.mean()))
    print(repr(bl_df.groupby(['name', 'id']).amount.mean()))

    print(repr(bl.by(bl_dt.name, amount=bl_dt.amount.mean())))
    print(repr(bl.by(bl.merge(bl_dt.name, bl_dt.id),
                     amount=bl_dt.amount.mean())))

    #pd.merge(pd_df, pd_df2, on='name')
    #bl.join(bl_dt, bl_dt2, 'name')

    # Element-wise mapping
    print(repr(pd_df.amount.map(lambda x: x + 1)))
    print(repr(bl_df.amount.map(lambda x: x + 1)))
    print(repr(bl_dt.amount.map(lambda x: x + 1, 'int64')))

    # Renaming columns
    print(repr(pd_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_dt.relabel(name='alias', amount='dollars')))

    # De-duplication (whole frame, then a single column)
    print(repr(pd_df.drop_duplicates()))
    print(repr(bl_df.drop_duplicates()))
    print(repr(bl_dt.distinct()))

    print(repr(pd_df.name.drop_duplicates()))
    print(repr(bl_df.name.drop_duplicates()))
    print(repr(bl_dt.name.distinct()))

    # Reductions
    print(repr(pd_df.amount.mean()))
    print(repr(bl_df.amount.mean()))
    print(repr(bl_dt.amount.mean()))

    print(repr(pd_df))
    print(repr(bl_df))
    print(repr(bl_dt))

    # Value counts
    print(repr(pd_df.amount.value_counts()), '\n')
    print(repr(bl_df.amount.value_counts()), '\n')
    print(repr(bl_dt.amount.count_values()), '\n')

    # Type / schema inspection
    print(repr(pd_df.dtypes), '\n')
    print(repr(bl_df.dtypes), '\n')
    print(repr(bl_df.columns), '\n')
    print(repr(bl_dt.dshape), '\n')

    print(repr(pd_df.amount.dtypes), '\n')
    print(repr(bl_df.amount.dtypes), '\n')
    print(repr(bl_dt.amount.dshape), '\n')

    print(type(pd_df), type(bl_df), type(bl_dt), '\n')

    # Clean up: remove the CSV written above (reusing `name` so the two
    # cannot drift apart) and any gzipped CSVs in the working directory.
    os.remove(name)
    for fn_ in glob.glob('*.csv.gz'):
        os.remove(fn_)
Exemple #7
0
    b = carray(np.arange(N, dtype='f8')+1)
    t = ctable((a, b), ('f0', 'f1'), rootdir='example1', mode='w')
    t.flush()
#------------------------------------------------------------------------

from time import time

# Timing comparison of chunked blaze reductions against NumPy on the
# `example1` ctable. The print calls take a single argument so the output is
# identical on Python 2 and Python 3.
print('-------------------')

t = blaze.open('ctable://example1')

# Using a chunked blaze array we can optimize for IO by doing the sum
# operations chunkwise from disk.

t0 = time()
print(blaze.mean(t, 'f0'))
print("Chunked mean %s" % round(time()-t0, 6))

# Using NumPy is just going to go through the iterator protocol on
# carray, which isn't going to be efficient.

t0 = time()
print(np.mean(t.data.ca['f0']))
print("NumPy mean %s" % round(time()-t0, 6))

print('===================')

t0 = time()
#assert blaze.std(t, 'f0') == 28867.513458037913
print(blaze.std(t, 'f0'))
print("Chunked std %s" % round(time()-t0, 6))
Exemple #8
0
def groupmeans(data, groups, numbers,
               cutoff=.01,
               quantile=.95,
               min_size=None):
    '''
    Yields the difference in average for every pair of group column and
    numeric column, together with its t-test probability.

    For each (group, number) pair, the categories of ``group`` that are large
    enough are ranked by their mean of ``number``; the lowest- and
    highest-ranked categories are then compared with ``ttest_ind``.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : accepted for backward compatibility; this implementation does
        NOT filter on it. Every eligible pair is yielded with its ``prob``
        and the caller decides what counts as significant.
    quantile : accepted for backward compatibility; the current
        implementation does not use it.
    min_size : each group should contain at least min_size values.
        If min_size=None, automatically set the minimum size to
        1% of the dataset, or 10, whichever is larger.

    Yields
    ------
    dict with keys ``group``, ``number``, ``prob`` (float), ``gain``
    (highest category mean relative to the overall mean, minus 1),
    ``biggies`` (dict of category -> mean for the large categories) and
    ``means`` (list of per-category records sorted by mean).
    '''

    if min_size is None:
        # compute nrows, bz.compute(data.nrows) doesn't work for sqlite
        min_size = max(bz.into(int, data.nrows) / 100, 10)

    # compute mean of each number column
    means = {col: bz.into(float, data[col].mean()) for col in numbers}
    # pre-create aggregation expressions (mean, count)
    agg = {number: bz.mean(data[number]) for number in numbers}
    for group in groups:
        # The count expression depends on the group, so it is replaced on
        # every iteration while the mean expressions are reused.
        agg['#'] = data[group].count()
        ave = bz.by(data[group], **agg).sort('#', ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave['#']
        # Each group should contain at least min_size values
        biggies = sizes[sizes >= min_size].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            sohi = sorted_cats.index[-1]
            solo = sorted_cats.index[0]

            # If sorted_cats.index items are of numpy type, then
            # convert them to native type, skip conversion for unicode, str
            # See https://github.com/blaze/blaze/issues/1461
            if isinstance(solo, np.generic):
                solo, sohi = solo.item(), sohi.item()

            lo = bz.into(list, data[number][data[group] == solo])
            hi = bz.into(list, data[number][data[group] == sohi])
            # Mask NaNs before comparing the two samples.
            _, prob = ttest_ind(
                np.ma.masked_array(lo, np.isnan(lo)),
                np.ma.masked_array(hi, np.isnan(hi))
            )
            # All results will be returned by default
            # Up to the user to ignore or show insignificant results
            # Uncomment below two lines to return only significant results
            # if prob > cutoff:
            #    continue

            yield ({
                'group': group,
                'number': number,
                'prob': float(prob),
                'gain': sorted_cats.iloc[-1] / means[number] - 1,
                # .loc replaces the removed DataFrame.ix indexer.
                'biggies': ave.loc[biggies, number].to_dict(),
                'means': ave[[number, '#']].sort_values(by=number).reset_index().to_dict(
                    orient='records'),
            })
Exemple #9
0
def groupmeans(data, groups, numbers,
               cutoff=.01,
               quantile=.95,
               min_size=None):
    '''
    Yields the significant differences in average for every pair of
    group column and numeric column.

    For each (group, number) pair, the categories of ``group`` that are large
    enough are ranked by their mean of ``number``; the lowest- and
    highest-ranked categories are then compared with ``ttest_ind``.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None skips the significance test entirely (speeding it up a
        LOT); every eligible pair is then yielded with ``prob`` set to None.
    quantile : accepted for backward compatibility; the current
        implementation does not use it.
    min_size : each group should contain at least min_size values.
        If min_size=None, automatically set the minimum size to
        1% of the dataset, or 10, whichever is larger.

    Yields
    ------
    dict with keys ``group``, ``number``, ``prob`` (float, or None when the
    test was skipped), ``gain`` (highest category mean relative to the
    overall mean, minus 1), ``biggies`` (dict of category -> mean for the
    large categories) and ``means`` (list of per-category records sorted by
    mean).
    '''

    if min_size is None:
        # compute nrows, bz.compute(data.nrows) doesn't work for sqlite
        min_size = max(bz.into(int, data.nrows) / 100, 10)

    # compute mean of each number column
    means = {col: bz.into(float, data[col].mean()) for col in numbers}
    # pre-create aggregation expressions (mean, count)
    agg = {number: bz.mean(data[number]) for number in numbers}
    for group in groups:
        # The count expression depends on the group, so it is replaced on
        # every iteration while the mean expressions are reused.
        agg['#'] = data[group].count()
        ave = bz.by(data[group], **agg).sort('#', ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave['#']
        # Each group should contain at least min_size values
        biggies = sizes[sizes >= min_size].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue

            prob = None
            if cutoff is not None:
                sohi = sorted_cats.index[-1]
                solo = sorted_cats.index[0]

                # If sorted_cats.index items are of numpy type, then
                # convert them to native type, skip conversion for unicode,
                # str. See https://github.com/blaze/blaze/issues/1461
                if isinstance(solo, np.generic):
                    solo, sohi = solo.item(), sohi.item()

                lo = bz.into(list, data[number][data[group] == solo])
                hi = bz.into(list, data[number][data[group] == sohi])

                # Mask NaNs before comparing the two samples.
                _, prob = ttest_ind(
                    np.ma.masked_array(lo, np.isnan(lo)),
                    np.ma.masked_array(hi, np.isnan(hi))
                )
                if prob > cutoff:
                    continue
                prob = float(prob)

            yield ({
                'group': group,
                'number': number,
                'prob': prob,
                'gain': sorted_cats.iloc[-1] / means[number] - 1,
                # .loc replaces the removed DataFrame.ix indexer.
                'biggies': ave.loc[biggies, number].to_dict(),
                'means': ave[[number, '#']].sort_values(by=number).reset_index().to_dict(
                    orient='records'),
            })