def test_groupby_01(self):
    """
    test_groupby_01: Test groupby's group creation
    (groupby single row results into multiple groups)
    """
    random.seed(1)
    num_rows = 2000
    groupby_cols = ['f0']
    # no operation is specified in `agg_list`, so `sum` is used by default.
    agg_list = ['f4', 'f5', 'f6']

    def key_func(row):
        return row[0]

    # -- Data --
    data = np.fromiter(self.gen_almost_unique_row(num_rows),
                       dtype='S1,f8,i8,i4,f8,i8,i4')

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # ctable wants to create the folder itself
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()
    fact_bcolz.cache_factor(groupby_cols, refresh=True)
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
    print(result_bcolz)

    # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, key_func)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    assert_list_equal(list(result_bcolz['f0']), uniquekeys)
def open(rootdir, mode='a'):
    # ----------------------------------------------------------------------
    # https://github.com/Blosc/bcolz/blob/master/bcolz/toplevel.py#L104-L132
    # ----------------------------------------------------------------------
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable as a bquery object so queries can be
    run against it.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object or IOError (if not objects are found)

    """
    # A ctable rootdir contains the ROOTDIRS marker file; a bare carray
    # does not — use that to decide which wrapper to instantiate.
    if not os.path.exists(os.path.join(rootdir, ROOTDIRS)):
        return bquery.carray(rootdir=rootdir, mode=mode)
    return bquery.ctable(rootdir=rootdir, mode=mode)
def test_groupby_01(self):
    """
    test_groupby_01: Test groupby's group creation
    (groupby single row results into multiple groups)
    """
    random.seed(1)
    groupby_cols = ['f0']
    groupby_lambda = lambda x: x[0]
    # no aggregation method given, so the default ('sum') is used
    agg_list = ['f4', 'f5', 'f6']
    num_rows = 2000

    # -- Data --
    g = self.gen_almost_unique_row(num_rows)
    data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # folder should be empty
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()

    fact_bcolz.cache_factor(groupby_cols, refresh=True)
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
    # BUG FIX: was a Python 2 `print result_bcolz` statement (SyntaxError
    # on Python 3, and inconsistent with the print() calls used elsewhere)
    print(result_bcolz)

    # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, groupby_lambda)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    assert_list_equal(list(result_bcolz['f0']), uniquekeys)
def test_groupby_05(self):
    """
    test_groupby_05: Test groupby's group creation without cache
    Groupby type 'sum'
    """
    random.seed(1)
    groupby_cols = ['f0']
    groupby_lambda = lambda x: x[0]
    agg_list = ['f1']
    num_rows = 200

    # exercise the groupby for each supported key dtype; this is a nose
    # generator test (see the trailing `yield`), one case per dtype
    for _dtype in \
            [
                'i8',
                'i4',
                'f8',
                'S1',
            ]:

        # -- Data --
        # keys cycle through 5 distinct values so exactly 5 groups result
        if _dtype == 'S1':
            iterable = ((str(x % 5), x % 5) for x in range(num_rows))
        else:
            iterable = ((x % 5, x % 5) for x in range(num_rows))

        data = np.fromiter(iterable, dtype=_dtype + ',i8')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # folder should be empty
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        # NOTE: deliberately no cache_factor() call here — this test covers
        # the uncached groupby path
        result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
        print(result_bcolz)

        # Itertools result
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, groupby_lambda)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        # reference result: per-group sum of f1 computed by hand
        ref = []
        for item in result_itt['groups']:
            f1 = 0
            for row in item:
                f0 = row[0]
                f1 += row[1]
            ref.append([f0] + [f1])

        assert_list_equal(
            sorted([list(x) for x in result_bcolz]),
            sorted(ref))

        yield self._assert_list_equal, list(result_bcolz['f0']), uniquekeys
def on_disk_data_cleaner(self, data):
    """Fixture generator: yield an on-disk ctable built from *data*,
    then remove its directory after the consumer resumes us."""
    workdir = tempfile.mkdtemp(prefix='bcolz-')
    # ctable wants to create the directory itself, so hand it a fresh path
    os.rmdir(workdir)
    self.rootdir = workdir
    bquery.ctable(data, rootdir=workdir).flush()
    yield bquery.open(workdir)
    shutil.rmtree(workdir)
    self.rootdir = None
def on_disk_data_cleaner(self, data):
    """Yield a freshly written on-disk ctable; clean up the directory
    once the caller is done with it."""
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    # ctable refuses an existing directory, so remove the temp dir first
    os.rmdir(self.rootdir)
    fact = bquery.ctable(data, rootdir=self.rootdir)
    fact.flush()
    fact = bquery.open(self.rootdir)
    yield fact
    shutil.rmtree(self.rootdir)
    self.rootdir = None
def uncompress_groupby_to_df(self, result_tar, groupby_col_list, agg_list,
                             where_terms_list, aggregate=False):
    """Uncompress a groupby result tar and convert it to a pandas DataFrame.

    *result_tar* is an in-memory tar (bytes/str) containing one sub-tar per
    shard; each sub-tar in turn contains an on-disk ctable.  The shard
    ctables are extracted, concatenated, optionally re-aggregated, and
    returned as a DataFrame.  All temporary files are removed on exit.

    NOTE(review): `where_terms_list` is accepted but never read in this
    body — presumably kept for interface symmetry; confirm with callers.
    """
    # uncompress result returned by the groupby and convert it to a Pandas DataFrame
    tmp_dir = None
    try:
        try:
            tar_file = TarFile(fileobj=StringIO(result_tar))
            tmp_dir = tempfile.mkdtemp(prefix='tar_dir_')
            tar_file.extractall(tmp_dir)
        except TarError:
            self.logger.exception("Could not create/extract tar.")
            raise ValueError(result_tar)
        # free the (potentially large) raw tar data as soon as possible
        del result_tar
        del tar_file
        ct = None
        # now untar and aggregate the individual shard results
        for i, sub_tar in enumerate(glob.glob(os.path.join(tmp_dir, '*'))):
            new_dir = os.path.join(tmp_dir, 'bcolz_' + str(i))
            rm_file_or_dir(new_dir)
            with tarfile.open(sub_tar, mode='r') as tar_file:
                tar_file.extractall(new_dir)
            # rm_file_or_dir(sub_tar)
            # each sub-tar holds exactly one ctable rootdir at its top level
            ctable_dir = glob.glob(os.path.join(new_dir, '*'))[0]
            new_ct = ctable(rootdir=ctable_dir, mode='a')
            # first shard becomes the accumulator; later shards are appended
            if i == 0:
                ct = new_ct
            else:
                ct.append(new_ct)
        # aggregate by groupby parameters
        if ct is None:
            # no shards found -> empty result
            result_df = pd.DataFrame()
        elif aggregate:
            new_dir = os.path.join(tmp_dir, 'end_result')
            rm_file_or_dir(new_dir)
            # we can only sum now
            new_agg_list = [[x[2], 'sum', x[2]] for x in agg_list]
            result_ctable = ct.groupby(groupby_col_list, new_agg_list,
                                       rootdir=new_dir)
            result_df = result_ctable.todataframe()
        else:
            result_df = ct.todataframe()
    finally:
        # always remove the extraction directory, even on error
        rm_file_or_dir(tmp_dir)
    return result_df
def test_groupby_04(self):
    """
    test_groupby_04: Test groupby's aggregation
    (groupby over multiple rows results
    into multiple groups)
    Groupby type 'sum'
    """
    random.seed(1)
    groupby_cols = ['f0', 'f1', 'f2']
    groupby_lambda = lambda x: [x[0], x[1], x[2]]
    agg_list = ['f4', 'f5', 'f6']
    # NOTE: removed unused local `agg_lambda` (was never referenced)
    num_rows = 2000

    # -- Data --
    g = self.gen_almost_unique_row(num_rows)
    data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # folder should be empty
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()

    fact_bcolz.cache_factor(groupby_cols, refresh=True)
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
    print(result_bcolz)

    # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, groupby_lambda)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    # reference: per-group sums of f4/f5/f6 computed by hand
    ref = []
    for item in result_itt['groups']:
        f4 = 0
        f5 = 0
        f6 = 0
        for row in item:
            f0 = groupby_lambda(row)
            f4 += row[4]
            f5 += row[5]
            f6 += row[6]
        ref.append(f0 + [f4, f5, f6])

    assert_list_equal(
        sorted([list(x) for x in result_bcolz]),
        sorted(ref))
def test_groupby_15(self):
    """
    test_groupby_15: Groupby type 'std'
    """
    random.seed(1)
    groupby_cols = ['f0']
    groupby_lambda = lambda x: x[0]
    agg_list = [['f4', 'std'], ['f5', 'std'], ['f6', 'std']]
    # NOTE: removed unused local `agg_lambda` (was never referenced)
    num_rows = 2000

    # -- Data --
    g = self.gen_almost_unique_row(num_rows)
    data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # folder should be empty
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()

    fact_bcolz.cache_factor(groupby_cols, refresh=True)
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
    print(result_bcolz)

    # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, groupby_lambda)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    # reference: per-group population std of f4/f5/f6 via numpy
    ref = []
    for item in result_itt['groups']:
        f4 = []
        f5 = []
        f6 = []
        for row in item:
            f0 = groupby_lambda(row)
            f4.append(row[4])
            f5.append(row[5])
            f6.append(row[6])
        ref.append([np.std(f4), np.std(f5), np.std(f6)])

    # remove the first (text) element for floating point comparison
    result = [list(x[1:]) for x in result_bcolz]
    assert_allclose(result, ref, rtol=1e-10)
def test_groupby_07(self):
    """
    test_groupby_07: Groupby type 'count_na'
    """
    random.seed(1)
    groupby_cols = ['f0']
    groupby_lambda = lambda x: x[0]
    agg_list = ['f4', 'f5', 'f6']
    num_rows = 1000

    # -- Data --
    g = self.gen_dataset_count_with_NA(num_rows)
    data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # folder should be empty
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()

    fact_bcolz.cache_factor(groupby_cols, refresh=True)
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list,
                                      agg_method='count_na')
    # BUG FIX: was a Python 2 `print result_bcolz` statement (SyntaxError
    # on Python 3, and inconsistent with the print() calls used elsewhere)
    print(result_bcolz)

    # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, groupby_lambda)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    # reference counts; `row[4] == row[4]` is False only for NaN, so this
    # counts non-NaN f4 rows — presumably matching count_na's semantics
    # for this generated dataset (TODO confirm against bquery docs)
    ref = []
    for item in result_itt['groups']:
        f4 = 0
        f5 = 0
        f6 = 0
        for row in item:
            f0 = groupby_lambda(row)
            if row[4] == row[4]:
                f4 += 1
                f5 += 1
                f6 += 1
        ref.append([f0, f4, f5, f6])

    assert_list_equal(
        [list(x) for x in result_bcolz],
        ref)
def test_groupby_07(self):
    """
    test_groupby_07: Groupby type 'count'
    """
    random.seed(1)
    num_rows = 1000
    groupby_cols = ['f0']
    agg_list = [['f4', 'count'], ['f5', 'count'], ['f6', 'count']]

    def key_func(row):
        return row[0]

    # -- Data --
    data = np.fromiter(self.gen_dataset_count_with_NA(num_rows),
                       dtype='S1,f8,i8,i4,f8,i8,i4')

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # ctable wants to create the folder itself
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()
    fact_bcolz.cache_factor(groupby_cols, refresh=True)
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
    print(result_bcolz)

    # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, key_func)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    # reference: each group's expected count is the number of rows whose
    # f4 is not NaN (NaN is the only value for which x == x is False)
    ref = []
    for group in result_itt['groups']:
        non_na = sum(1 for row in group if row[4] == row[4])
        ref.append([key_func(group[0]), non_na, non_na, non_na])

    assert_list_equal(
        [list(x) for x in result_bcolz],
        ref)
def test_groupby_09(self):
    """
    test_groupby_09: Groupby's type 'sorted_count_distinct'
    """
    # BUG FIX: docstring previously said 'test_groupby_08', mismatching the
    # method name; also converted Python 2 print statements to print() calls
    random.seed(1)
    groupby_cols = ['f0']
    groupby_lambda = lambda x: x[0]
    agg_list = ['f4', 'f5', 'f6']
    num_rows = 1000

    # -- Data --
    # NOTE(review): unlike the sibling test_groupby_09 variant, the data is
    # not pre-sorted on f0 here before using 'sorted_count_distinct' —
    # confirm the generator already yields key-sorted rows
    g = self.gen_dataset_count_with_NA_09(num_rows)
    data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')
    print('data')
    print(data)

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # folder should be empty
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()

    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list,
                                      agg_method='sorted_count_distinct')
    print(result_bcolz)

    # # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, groupby_lambda)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    # reference: number of distinct values per column within each group
    ref = []
    for n, (u, item) in enumerate(zip(uniquekeys, result_itt['groups'])):
        f4 = len(self._get_unique([x[4] for x in result_itt['groups'][n]]))
        f5 = len(self._get_unique([x[5] for x in result_itt['groups'][n]]))
        f6 = len(self._get_unique([x[6] for x in result_itt['groups'][n]]))
        ref.append([u, f4, f5, f6])
    print(ref)

    assert_list_equal(
        [list(x) for x in result_bcolz],
        ref)
def test_groupby_09(self):
    """
    test_groupby_09: Groupby's type 'sorted_count_distinct'
    """
    random.seed(1)
    num_rows = 2000
    groupby_cols = ['f0']
    agg_list = [['f4', 'sorted_count_distinct'],
                ['f5', 'sorted_count_distinct'],
                ['f6', 'sorted_count_distinct']]

    def key_func(row):
        return row[0]

    # -- Data --
    # 'sorted_count_distinct' requires the input to be pre-sorted on the key
    rows = sorted(self.gen_dataset_count_with_NA_09(num_rows), key=key_func)
    data = np.fromiter(rows, dtype='S1,f8,i8,i4,f8,i8,i4')
    print('data')
    print(data)

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # ctable wants to create the folder itself
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
    print(result_bcolz)

    # # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, key_func)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    # reference: distinct-value counts per column within each group
    ref = []
    for key, group in zip(uniquekeys, result_itt['groups']):
        counts = [len(self._get_unique([row[col] for row in group]))
                  for col in (4, 5, 6)]
        ref.append([key] + counts)
    print(ref)

    assert_list_equal(
        [list(x) for x in result_bcolz],
        ref)
def test_where_terms00(self):
    """
    test_where_terms00: get terms in one column bigger than a certain value
    """
    n = 20000
    threshold = 10000

    # expected result: True exactly where f0 exceeds the threshold
    expected = bquery.carray(
        np.fromiter((x > threshold for x in range(n)), dtype='bool'))

    # data to filter on: two identical integer columns
    data = np.fromiter(((x, x) for x in range(n)), dtype='i8,i8')

    # filter data
    ct = bquery.ctable(data, rootdir=self.rootdir)
    result = ct.where_terms([('f0', '>', threshold)])

    # compare
    assert_array_equal(result, expected)
def test_where_terms_04(self):
    """
    test_where_terms_04: get mask where terms in list with only one item
    """
    n = 20000
    include = [0]

    # expected result: only the included positions are True
    expected = np.zeros(n, dtype=bool)
    expected[include] = True

    # data to filter on: two identical integer columns
    data = np.fromiter(((x, x) for x in range(n)), dtype='i8,i8')

    # filter data
    ct = bquery.ctable(data, rootdir=self.rootdir)
    result = ct.where_terms([('f0', 'in', include)])

    assert_array_equal(result, expected)
def test_where_terms02(self):
    """
    test_where_terms02: get mask where terms not in list
    """
    n = 20000
    exclude = [0, 1, 2, 3, 11, 12, 13]

    # expected result: everything True except the excluded positions
    expected = np.ones(n, dtype=bool)
    expected[exclude] = False

    # data to filter on: two identical integer columns
    data = np.fromiter(((x, x) for x in range(n)), dtype='i8,i8')

    # filter data
    ct = bquery.ctable(data, rootdir=self.rootdir)
    result = ct.where_terms([('f0', 'not in', exclude)])

    assert_array_equal(result, expected)
def test_factorize_groupby_cols_01(self):
    """
    test_factorize_groupby_cols_01:
    """
    expected_table = np.arange(20000) % 5
    expected_groups = np.arange(5)

    # generate data: f1 cycles through 5 values
    data = np.fromiter(((x, x % 5) for x in range(20000)), dtype='i8,i8')
    ct = bquery.ctable(data, rootdir=self.rootdir)

    # first call factorizes the column; second call must hit the cache
    first = ct.factorize_groupby_cols(['f1'])
    second = ct.factorize_groupby_cols(['f1'])

    assert_array_equal(expected_table, first[0][0])
    assert_array_equal(expected_groups, first[1][0])
    # cached result must be identical to the freshly factorized one
    assert_array_equal(first[0][0], second[0][0])
    assert_array_equal(first[1][0], second[1][0])
def test_factorize_groupby_cols_01(self):
    """
    test_factorize_groupby_cols_01:
    """
    expected_table = np.arange(20000) % 5
    expected_groups = np.arange(5)

    # generate data: f1 cycles through 5 values
    data = np.fromiter(((x, x % 5) for x in range(20000)), dtype='i8,i8')
    ct = bquery.ctable(data,
                       rootdir=tempfile.mkdtemp(prefix='bcolz-'),
                       mode='w')

    # first call factorizes the column; second call must hit the cache
    first = ct.factorize_groupby_cols(['f1'])
    second = ct.factorize_groupby_cols(['f1'])

    assert_array_equal(expected_table, first[0][0])
    assert_array_equal(expected_groups, first[1][0])
    # cached result must be identical to the freshly factorized one
    assert_array_equal(first[0][0], second[0][0])
    assert_array_equal(first[1][0], second[1][0])
def test_groupby_02(self):
    """
    test_groupby_02: Test groupby's group creation
    (groupby over multiple rows results into multiple groups)
    """
    random.seed(1)
    num_rows = 2000
    groupby_cols = ['f0', 'f1', 'f2']
    # no operation is specified in `agg_list`, so `sum` is used by default.
    agg_list = ['f4', 'f5', 'f6']

    def key_func(row):
        return [row[0], row[1], row[2]]

    # -- Data --
    data = np.fromiter(self.gen_almost_unique_row(num_rows),
                       dtype='S1,f8,i8,i4,f8,i8,i4')

    # -- Bcolz --
    print('--> Bcolz')
    self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(self.rootdir)  # ctable wants to create the folder itself
    fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
    fact_bcolz.flush()
    fact_bcolz.cache_factor(groupby_cols, refresh=True)
    result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
    print(result_bcolz)

    # Itertools result
    print('--> Itertools')
    result_itt = self.helper_itt_groupby(data, key_func)
    uniquekeys = result_itt['uniquekeys']
    print(uniquekeys)

    assert_list_equal(
        sorted([list(x) for x in result_bcolz[groupby_cols]]),
        sorted(uniquekeys))
def create_bcolz_chunks(workdir):
    """Convert every yellow_tripdata_*.csv under *workdir* to an on-disk
    bcolz ctable (taxi_<i>) with derived date/time columns and pre-built
    factorization caches.

    Raises ValueError when no input CSVs are found.
    """
    file_list = sorted(glob.glob(workdir + 'yellow_tripdata_*.csv'))
    if not file_list:
        raise ValueError('No Files Found')

    for i, filename in enumerate(file_list):
        print(filename)
        rootdir = workdir + 'taxi_' + str(i)

        import_df = pd.read_csv(filename)
        # lower columns because of input inconsistencies
        import_df.columns = [x.lower() for x in import_df.columns]
        import_df.columns = [x.strip() for x in import_df.columns]
        import_df.columns = [x.replace('tpep_', '') for x in import_df.columns]

        import_df['nr_rides'] = 1
        # the pickup/dropoff expansion was duplicated inline; factored out
        # into one helper applied to both prefixes
        _expand_datetime_columns(import_df, 'pickup')
        _expand_datetime_columns(import_df, 'dropoff')

        import_ct = ctable.fromdataframe(import_df, rootdir=rootdir,
                                         expectedlen=len(import_df),
                                         mode='w')
        del import_df
        import_ct.flush()

        import_ct = ctable(rootdir=rootdir, mode='a')
        import_ct.cache_factor([
            'dropoff_date', 'dropoff_hour', 'dropoff_latitude',
            'dropoff_longitude', 'dropoff_month', 'dropoff_time',
            'dropoff_year', 'dropoff_yearmonth', 'payment_type',
            'pickup_date', 'pickup_hour', 'pickup_latitude',
            'pickup_longitude', 'pickup_month', 'pickup_time',
            'pickup_year', 'pickup_yearmonth', 'ratecodeid',
            'store_and_fwd_flag', 'vendorid'])


def _expand_datetime_columns(df, prefix):
    """Split '<prefix>_datetime' ('YYYY-MM-DD HH:MM:SS' strings) into int
    columns: <prefix>_date/year/yearmonth/month/time/hour.  Drops the
    original datetime column.  Mutates *df* in place.

    NOTE(review): assumes zero-padded, fixed-width timestamps — confirm
    against the raw CSV format.
    """
    dt_col = prefix + '_datetime'
    date_col = prefix + '_date'
    time_col = prefix + '_time'

    # date part: 'YYYY-MM-DD' -> 'YYYYMMDD', then derived int columns
    df[date_col] = df[dt_col].str[0:10].str.replace('-', '')
    df[prefix + '_year'] = df[date_col].str[0:4].astype(int)
    df[prefix + '_yearmonth'] = df[date_col].str[0:6].astype(int)
    df[prefix + '_month'] = df[date_col].str[4:6].astype(int)
    df[date_col] = df[date_col].astype(int)

    # time part: 'HH:MM:SS' -> 'HHMMSS', then derived int columns
    df[time_col] = df[dt_col].str[11:].str.replace(':', '')
    df[prefix + '_hour'] = df[time_col].str[0:2].astype(int)
    df[time_col] = df[time_col].astype(int)

    del df[dt_col]
def handle_work(self, msg):
    """Execute a groupby/filter job described by *msg* against an on-disk
    ctable and return the result as an in-message tar payload.

    msg args: (filename, groupby_col_list, aggregation_list,
    where_terms_list); kwargs: expand_filter_column, aggregate (default
    True).  Raises when the requested rootdir does not exist.
    """
    # code-execution messages are delegated entirely
    if msg.isa('execute_code'):
        return self.execute_code(msg)
    tmp_dir = tempfile.mkdtemp(prefix='result_')
    buf_file_fd, buf_file = tempfile.mkstemp(prefix='tar_')
    # only the path is needed; tarfile reopens it later
    os.close(buf_file_fd)
    args, kwargs = msg.get_args_kwargs()
    self.logger.info('doing calc %s' % args)
    filename = args[0]
    groupby_col_list = args[1]
    aggregation_list = args[2]
    where_terms_list = args[3]
    expand_filter_column = kwargs.get('expand_filter_column')
    aggregate = kwargs.get('aggregate', True)
    # create rootdir
    rootdir = os.path.join(self.data_dir, filename)
    if not os.path.exists(rootdir):
        raise Exception('Path %s does not exist' % rootdir)
    ct = bquery.ctable(rootdir=rootdir, mode='r', auto_cache=True)
    # prepare filter
    if not where_terms_list:
        bool_arr = None
    else:
        # quickly verify the where_terms_list
        if not ct.where_terms_factorization_check(where_terms_list):
            # return an empty result because the where terms do not give a result for this ctable
            msg['data'] = ''
            return msg
        # else create the boolean array
        bool_arr = ct.where_terms(where_terms_list, cache=True)
    # expand filter column check
    if expand_filter_column:
        bool_arr = ct.is_in_ordered_subgroups(
            basket_col=expand_filter_column, bool_arr=bool_arr)
    # retrieve & aggregate if needed
    # groupby/fromiter want to create tmp_dir themselves
    rm_file_or_dir(tmp_dir)
    if aggregate:
        # aggregate by groupby parameters
        result_ctable = ct.groupby(groupby_col_list, aggregation_list,
                                   bool_arr=bool_arr, rootdir=tmp_dir)
    else:
        # direct result from the ctable
        column_list = groupby_col_list + [x[0] for x in aggregation_list]
        if bool_arr is not None:
            # sum(bool_arr) == number of selected rows
            result_ctable = bcolz.fromiter(
                ct[column_list].where(bool_arr), ct[column_list].dtype,
                sum(bool_arr), rootdir=tmp_dir, mode='w')
        else:
            result_ctable = bcolz.fromiter(
                ct[column_list], ct[column_list].dtype, ct.len,
                rootdir=tmp_dir, mode='w')
    # *** clean up temporary files and memory objects
    # filter
    del bool_arr
    # input
    ct.free_cachemem()
    ct.clean_tmp_rootdir()
    del ct
    # save result to archive
    result_ctable.flush()
    result_ctable.free_cachemem()
    with tarfile.open(buf_file, mode='w') as archive:
        archive.add(tmp_dir, arcname=os.path.basename(tmp_dir))
    del result_ctable
    rm_file_or_dir(tmp_dir)
    # create message
    # NOTE(review): opens the tar in text mode ('r') and shadows the
    # builtin `open`-result name `file` — looks like Python 2 era code;
    # confirm 'rb' is not required on Python 3
    with open(buf_file, 'r') as file:
        # add result to message
        msg['data'] = file.read()
    rm_file_or_dir(buf_file)
    return msg
# benchmark script body: build a synthetic 4-column dataset and time the
# same f0-keyed sum of f2 in pandas and in cytoolz-over-bcolz.
# NOTE(review): uses `izip` — Python 2 itertools; confirm the file's
# imports alias it appropriately on Python 3.
ga = itt.cycle(['ES', 'NL'])
gb = itt.cycle(['b1', 'b2', 'b3', 'b4', 'b5'])
gx = itt.cycle([1, 2])
gy = itt.cycle([-1, -2])

rootdir = 'bench-data.bcolz'
# start from a clean on-disk location
if os.path.exists(rootdir):
    shutil.rmtree(rootdir)

n_rows = 1000000
print('Rows: ', n_rows)

# -- data
z = np.fromiter(((a, b, x, y) for a, b, x, y in izip(ga, gb, gx, gy)),
                dtype='S2,S2,i8,i8', count=n_rows)
ct = bquery.ctable(z, rootdir=rootdir, )
print(ct)

# -- pandas --
df = pd.DataFrame(z)
with ctime(message='pandas'):
    result = df.groupby(['f0'])['f2'].sum()
print(result)
t_pandas = t_elapsed

# -- cytoolz --
with ctime(message='cytoolz over bcolz'):
    # In Memory Split-Apply-Combine
    # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
    r = cytoolz.groupby(lambda row: row.f0, ct)
    result = valmap(compose(sum, pluck(2)), r)
# benchmark script body (variant): same synthetic-data benchmark as above;
# `ga`/`gb` are presumably defined earlier outside this chunk — verify.
# NOTE(review): `izip` is Python 2 itertools; confirm the import shim.
gx = itt.cycle([1, 2])
gy = itt.cycle([-1, -2])

rootdir = 'bench-data.bcolz'
# start from a clean on-disk location
if os.path.exists(rootdir):
    shutil.rmtree(rootdir)

n_rows = 1000000
print('Rows: ', n_rows)

# -- data
z = np.fromiter(((a, b, x, y) for a, b, x, y in izip(ga, gb, gx, gy)),
                dtype='S2,S2,i8,i8', count=n_rows)
ct = bquery.ctable(
    z,
    rootdir=rootdir,
)
print(ct)

# -- pandas --
df = pd.DataFrame(z)
with ctime(message='pandas'):
    result = df.groupby(['f0'])['f2'].sum()
print(result)
t_pandas = t_elapsed

# -- cytoolz --
with ctime(message='cytoolz over bcolz'):
    # In Memory Split-Apply-Combine
    # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
    r = cytoolz.groupby(lambda row: row.f0, ct)