import os

# `params`, `Table`, `open`, `mean`, `std`, `select`, `select2`, and the
# `adapter` data source come from the early blaze 0.x API and the
# surrounding module; they are assumed to be in scope here.
def test_simple():
    if not os.path.exists('./noaa_data'):
        p = params(clevel=5, storage='./noaa_data')
        t = Table([], dshape='{f0: int, f1: int, f2: int, f3: float}',
                  params=p)
        # TODO: chunkwise copy
        t.append(adapter[:])
        t.commit()
    else:
        t = open('ctable://noaa_data')

    print('--------------------------------------')
    print('mean', mean(t, 'f3'))
    print('std', std(t, 'f2'))
    print('--------------------------------------')

    qs1 = select(t, lambda x: x > 80000, 'f0')
    qs2 = select2(t, lambda x, y: x > y, ['f0', 'f1'])

    result = t[qs1]
def test_mean(self, data):
    from blaze import mean
    # `t` (the symbol) and `x` (the reference data) are module-level
    # fixtures defined elsewhere in the test suite.
    assert compute(mean(t['amount']), data) == x['amount'].mean()
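# A self-contained sketch of the fixture context that test_mean assumes:
# `t` is a blaze Symbol and `x` is the reference pandas object, both
# normally defined at module scope in the test suite. The names, dshape,
# and toy values here are assumptions for illustration, not the original
# fixtures.
import pandas as pd
from blaze import Symbol, compute, mean

x = pd.DataFrame({'amount': [100, 200, 300]})
t = Symbol('t', 'var * {amount: int64}')
data = x  # the test's `data` parameter would be supplied by a fixture
assert compute(mean(t['amount']), data) == x['amount'].mean()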
import blaze as bz
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind


def groupmeans(data, groups, numbers, cutoff=0.01, quantile=0.95,
               minsize=None):
    """
    Returns the significant differences in average between every pair of
    groups and numbers, as a DataFrame.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None ignores significance checks, speeding it up a LOT.
    quantile : number that represents target improvement. Defaults to .95.
        The ``diff`` returned is the % impact of everyone moving to the
        95th percentile.
    minsize : each group should contain at least minsize values.
        If minsize=None, automatically set the minimum size to 1% of the
        dataset, or 10, whichever is larger.
    """
    if minsize is None:
        minsize = max(data.nrows / 100, 10)

    means = {col: data[col].mean() for col in numbers}
    results = []
    for group in groups:
        agg = {number: bz.mean(data[number]) for number in numbers}
        agg["#"] = bz.count(data)
        ave = bz.by(data[group], **agg).sort("#", ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave["#"]
        # Each group should contain at least minsize values ...
        biggies = sizes[sizes >= minsize].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            lo = bz.into(list,
                         data[number][data[group] == sorted_cats.index[0]])
            hi = bz.into(list,
                         data[number][data[group] == sorted_cats.index[-1]])
            _, prob = ttest_ind(np.ma.masked_array(lo, np.isnan(lo)),
                                np.ma.masked_array(hi, np.isnan(hi)))
            if prob > cutoff:
                continue
            results.append({
                "group": group,
                "number": number,
                "prob": prob,
                "gain": (sorted_cats.iloc[-1] / means[number] - 1)[0],
                "biggies": ave.loc[biggies][number],
                "means": ave[[number, "#"]].sort_values(by=number),
            })

    results = pd.DataFrame(results)
    if len(results) > 0:
        results = results.set_index(["group", "number"])
    return results
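# Hedged usage sketch for the DataFrame-returning groupmeans above. The toy
# table and column names are invented for illustration; any blaze-compatible
# source (CSV, SQL, DataFrame) should work the same way.
import blaze as bz
import pandas as pd

df = pd.DataFrame({'city': ['a', 'a', 'b', 'b'] * 25,
                   'sales': [1.0, 2.0, 10.0, 12.0] * 25})
out = groupmeans(bz.Data(df), groups=['city'], numbers=['sales'], minsize=5)
print(out)  # one row per (group, number) pair that passed the cutoff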
import glob
import os

import blaze as bl
import pandas as pd
from odo import odo


def blaze_tutorial():
    accounts = bl.Symbol('accounts',
                         'var * {id: int, name: string, amount: int}')

    deadbeats = accounts[accounts.amount < 0].name

    list_ = [[1, 'Alice', 100],
             [2, 'Bob', -200],
             [3, 'Charlie', 300],
             [4, 'Denis', 400],
             [5, 'Edith', -500]]

    print(list(bl.compute(deadbeats, list_)))

    df_ = bl.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(bl.compute(deadbeats, df_))

    bl_df_dir = dir(df_)

    df_ = pd.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(df_[df_.amount < 0].name)

    pd_df_dir = dir(df_)

    print(len(bl_df_dir), len(pd_df_dir))
    print(len([d for d in bl_df_dir if d in pd_df_dir]))
    print([d for d in bl_df_dir if d not in pd_df_dir])
    print([d for d in pd_df_dir if d not in bl_df_dir])

    df_ = bl.Data([(1, 'Alice', 100),
                   (2, 'Bob', -200),
                   (3, 'Charlie', 300),
                   (4, 'Denis', 400),
                   (5, 'Edith', -500)],
                  fields=['id', 'name', 'balance'])

    print(repr(df_))
    print(repr(df_[df_.balance < 0]))
    print(repr(df_[df_.balance < 0].name))
    print(list(df_[df_.balance < 0].name))

    iris = bl.Data('examples/iris.csv')
    print(repr(iris))

    iris = bl.Data('sqlite:///examples/iris.db::iris')
    print(repr(iris))

    print(repr(bl.by(iris.species,
                     min=iris.petal_width.min(),
                     max=iris.petal_width.max())))

    result = bl.by(iris.species,
                   min=iris.petal_width.min(),
                   max=iris.petal_width.max())

    print(odo(result, bl.DataFrame))
    print(odo(result, pd.DataFrame))

    ### odo has a weird issue with unicode filenames, apparently...
    name = 'output.csv'
    print(odo(result, bl.CSV(name)))

    print(repr(iris.sepal_length.mean()))
    print(repr(bl.mean(iris.sepal_length)))

    print(repr(bl.by(iris.species,
                     shortest=iris.petal_length.min(),
                     longest=iris.petal_length.max(),
                     average=iris.petal_length.mean())))

    print(repr(iris.head()))

    iris = bl.transform(iris,
                        sepal_ratio=iris.sepal_length / iris.sepal_width,
                        petal_ratio=iris.petal_length / iris.petal_width)
    print(repr(iris.head()))

    versicolor = iris[iris.species.like('%versicolor')]
    print(repr(versicolor))
    print((len(versicolor), len(versicolor.fields)))

    print(repr(iris.relabel(petal_length='PETAL-LENGTH',
                            petal_width='PETAL-WIDTH')))

    pd_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Joe', 'Bob'],
                          'amount': [100, 200, 300, 400],
                          'id': [1, 2, 3, 4]})

    # Put the `pd_df` DataFrame into a blaze DataFrame and a Data object.
    bl_df = bl.DataFrame(pd_df)
    bl_dt = bl.Data(pd_df)

    print(repr(pd_df.amount * 2))
    print(repr(bl_df.amount * 2))
    print(repr(bl_dt.amount * 2))

    print(repr(pd_df[['id', 'amount']]))
    print(repr(bl_df[['id', 'amount']]))
    print(repr(bl_dt[['id', 'amount']]))

    print(repr(pd_df[pd_df.amount > 300]))
    print(repr(bl_df[bl_df.amount > 300]))
    print(repr(bl_dt[bl_dt.amount > 300]))

    print(repr(pd_df.groupby('name').amount.mean()))
    print(repr(pd_df.groupby(['name', 'id']).amount.mean()))
    print(repr(bl_df.groupby('name').amount.mean()))
    print(repr(bl_df.groupby(['name', 'id']).amount.mean()))
    print(repr(bl.by(bl_dt.name, amount=bl_dt.amount.mean())))
    print(repr(bl.by(bl.merge(bl_dt.name, bl_dt.id),
                     amount=bl_dt.amount.mean())))

    # pd.merge(pd_df, pd_df2, on='name')
    # bl.join(bl_dt, bl_dt2, 'name')

    print(repr(pd_df.amount.map(lambda x: x + 1)))
    print(repr(bl_df.amount.map(lambda x: x + 1)))
    print(repr(bl_dt.amount.map(lambda x: x + 1, 'int64')))

    print(repr(pd_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_dt.relabel(name='alias', amount='dollars')))

    print(repr(pd_df.drop_duplicates()))
    print(repr(bl_df.drop_duplicates()))
    print(repr(bl_dt.distinct()))

    print(repr(pd_df.name.drop_duplicates()))
    print(repr(bl_df.name.drop_duplicates()))
    print(repr(bl_dt.name.distinct()))

    print(repr(pd_df.amount.mean()))
    print(repr(bl_df.amount.mean()))
    print(repr(bl_dt.amount.mean()))

    print(repr(pd_df))
    print(repr(bl_df))
    print(repr(bl_dt))

    print(repr(pd_df.amount.value_counts()), '\n')
    print(repr(bl_df.amount.value_counts()), '\n')
    print(repr(bl_dt.amount.count_values()), '\n')

    print(repr(pd_df.dtypes), '\n')
    print(repr(bl_df.dtypes), '\n')
    print(repr(bl_df.columns), '\n')
    print(repr(bl_dt.dshape), '\n')

    print(repr(pd_df.amount.dtypes), '\n')
    print(repr(bl_df.amount.dtypes), '\n')
    print(repr(bl_dt.amount.dshape), '\n')

    print(type(pd_df), type(bl_df), type(bl_dt), '\n')

    os.remove('output.csv')
    for fn_ in glob.glob('*.csv.gz'):
        os.remove(fn_)

    return
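# Hedged, self-contained sketch of the join that the commented-out
# pd.merge / bl.join lines inside blaze_tutorial allude to. pd_df2 is
# invented here; blaze's join mirrors pandas' merge for the equi-join case.
import blaze as bl
import pandas as pd

pd_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Joe', 'Bob'],
                      'amount': [100, 200, 300, 400],
                      'id': [1, 2, 3, 4]})
pd_df2 = pd.DataFrame({'name': ['Alice', 'Bob'], 'city': ['NYC', 'LA']})

print(pd.merge(pd_df, pd_df2, on='name'))
print(repr(bl.join(bl.Data(pd_df), bl.Data(pd_df2), 'name')))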
import numpy as np
import blaze
from carray import carray, ctable  # assumed imports: the standalone carray
                                   # package (bcolz's ancestor)

# Assumed setup, missing from the original fragment: N = 100000 matches the
# asserted std of 'f0' below (std(arange(1e5)) ~= 28867.5135), and 'a'
# mirrors 'b' without the +1.
N = 100000
a = carray(np.arange(N, dtype='f8'))
b = carray(np.arange(N, dtype='f8') + 1)
t = ctable((a, b), ('f0', 'f1'), rootdir='example1', mode='w')
t.flush()

#------------------------------------------------------------------------

from time import time

print('-------------------')

t = blaze.open('ctable://example1')

# Using a chunked blaze array we can optimize for IO, doing the sum
# operations chunkwise from disk.
t0 = time()
print(blaze.mean(t, 'f0'))
print("Chunked mean", round(time() - t0, 6))

# Using NumPy just goes through the iterator protocol on carray,
# which isn't going to be efficient.
t0 = time()
print(np.mean(t.data.ca['f0']))
print("NumPy mean", round(time() - t0, 6))

print('===================')

t0 = time()
# assert blaze.std(t, 'f0') == 28867.513458037913
print(blaze.std(t, 'f0'))
print("Chunked std", round(time() - t0, 6))
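# The chunked mean above works roughly like this sketch: stream blocks from
# disk and keep a running (sum, count) pair instead of materializing the
# whole column. The helper below is illustrative, not blaze's internals.
import numpy as np

def chunked_mean(blocks):
    total, count = 0.0, 0
    for block in blocks:      # each block is a small ndarray
        total += block.sum()
        count += block.size
    return total / count

parts = np.array_split(np.arange(100000, dtype='f8'), 100)  # fake chunks
assert chunked_mean(parts) == np.mean(np.arange(100000, dtype='f8'))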
import blaze as bz
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind


def groupmeans(data, groups, numbers, cutoff=.01, quantile=.95,
               min_size=None):
    '''
    Yields the differences in average between every pair of groups and
    numbers. All results are yielded, significant or not; see the note on
    ``cutoff`` below.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : significance threshold: anything with prob > cutoff is
        insignificant. The filter itself is disabled by default (see the
        commented-out lines in the body).
    quantile : number that represents target improvement. Defaults to .95.
        The ``diff`` returned is the % impact of everyone moving to the
        95th percentile
    min_size : each group should contain at least min_size values.
        If min_size=None, automatically set the minimum size to 1% of the
        dataset, or 10, whichever is larger.
    '''
    if min_size is None:
        # compute nrows; bz.compute(data.nrows) doesn't work for sqlite
        min_size = max(bz.into(int, data.nrows) / 100, 10)

    # compute the mean of each number column
    means = {col: bz.into(float, data[col].mean()) for col in numbers}
    # pre-create aggregation expressions (mean, count)
    agg = {number: bz.mean(data[number]) for number in numbers}
    for group in groups:
        agg['#'] = data[group].count()
        ave = bz.by(data[group], **agg).sort('#', ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave['#']
        # Each group should contain at least min_size values ...
        biggies = sizes[sizes >= min_size].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            sohi = sorted_cats.index[-1]
            solo = sorted_cats.index[0]
            # If sorted_cats.index items are of numpy type, convert them to
            # the native type; skip conversion for unicode, str.
            # See https://github.com/blaze/blaze/issues/1461
            if isinstance(solo, np.generic):
                solo, sohi = solo.item(), sohi.item()
            lo = bz.into(list, data[number][data[group] == solo])
            hi = bz.into(list, data[number][data[group] == sohi])
            _, prob = ttest_ind(
                np.ma.masked_array(lo, np.isnan(lo)),
                np.ma.masked_array(hi, np.isnan(hi))
            )
            # All results are yielded by default; it is up to the caller to
            # ignore or show insignificant ones. Uncomment the two lines
            # below to yield only significant results.
            # if prob > cutoff:
            #     continue
            yield {
                'group': group,
                'number': number,
                'prob': float(prob),
                'gain': sorted_cats.iloc[-1] / means[number] - 1,
                'biggies': ave.loc[biggies][number].to_dict(),
                'means': ave[[number, '#']].sort_values(by=number)
                            .reset_index().to_dict(orient='records'),
            }
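# Hedged usage sketch: this variant is a generator that yields every
# (group, number) pair, so materialize or filter it yourself. The toy
# frame is invented for illustration.
import blaze as bz
import pandas as pd

df = pd.DataFrame({'city': ['a', 'a', 'b', 'b'] * 25,
                   'sales': [1.0, 2.0, 10.0, 12.0] * 25})
for row in groupmeans(bz.Data(df), groups=['city'], numbers=['sales'],
                      min_size=5):
    print(row['group'], row['number'], row['prob'], row['gain'])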
import blaze as bz
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind


def groupmeans(data, groups, numbers, cutoff=.01, quantile=.95,
               min_size=None):
    '''
    Yields the significant differences in average between every pair of
    groups and numbers.

    Parameters
    ----------
    data : blaze data object
    groups : non-empty iterable containing category column names in data
    numbers : non-empty iterable containing numeric column names in data
    cutoff : ignore anything with prob > cutoff.
        cutoff=None ignores significance checks, speeding it up a LOT.
    quantile : number that represents target improvement. Defaults to .95.
        The ``diff`` returned is the % impact of everyone moving to the
        95th percentile
    min_size : each group should contain at least min_size values.
        If min_size=None, automatically set the minimum size to 1% of the
        dataset, or 10, whichever is larger.
    '''
    if min_size is None:
        # compute nrows; bz.compute(data.nrows) doesn't work for sqlite
        min_size = max(bz.into(int, data.nrows) / 100, 10)

    # compute the mean of each number column
    means = {col: bz.into(float, data[col].mean()) for col in numbers}
    # pre-create aggregation expressions (mean, count)
    agg = {number: bz.mean(data[number]) for number in numbers}
    for group in groups:
        agg['#'] = data[group].count()
        ave = bz.by(data[group], **agg).sort('#', ascending=False)
        ave = bz.into(pd.DataFrame, ave)
        ave.index = ave[group]
        sizes = ave['#']
        # Each group should contain at least min_size values ...
        biggies = sizes[sizes >= min_size].index
        # ... and at least 2 groups overall, to compare.
        if len(biggies) < 2:
            continue
        for number in numbers:
            if number == group:
                continue
            sorted_cats = ave[number][biggies].dropna().sort_values()
            if len(sorted_cats) < 2:
                continue
            sohi = sorted_cats.index[-1]
            solo = sorted_cats.index[0]
            # If sorted_cats.index items are of numpy type, convert them to
            # the native type; skip conversion for unicode, str.
            # See https://github.com/blaze/blaze/issues/1461
            if isinstance(solo, np.generic):
                solo, sohi = solo.item(), sohi.item()
            lo = bz.into(list, data[number][data[group] == solo])
            hi = bz.into(list, data[number][data[group] == sohi])
            _, prob = ttest_ind(
                np.ma.masked_array(lo, np.isnan(lo)),
                np.ma.masked_array(hi, np.isnan(hi))
            )
            if prob > cutoff:
                continue
            yield {
                'group': group,
                'number': number,
                'prob': float(prob),
                'gain': sorted_cats.iloc[-1] / means[number] - 1,
                'biggies': ave.loc[biggies][number].to_dict(),
                'means': ave[[number, '#']].sort_values(by=number)
                            .reset_index().to_dict(orient='records'),
            }
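# In this variant the cutoff is active, so insignificant pairs are dropped
# inside the loop; collecting the generator into a DataFrame recovers the
# tabular shape of the first variant. Toy data as in the sketch above.
results = pd.DataFrame(groupmeans(bz.Data(df), ['city'], ['sales'],
                                  cutoff=0.05, min_size=5))
print(results)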