def unique(self, col_or_col_list):
    """Return the unique values of a column, or a list of such lists
    when a list of columns is given.

    :param col_or_col_list: a column or a list of columns
    :return: unique values for the column (list of lists for a column list)
    """
    single_input = not isinstance(col_or_col_list, list)
    cols = [col_or_col_list] if single_input else col_or_col_list

    output = []
    for col in cols:
        if self.cache_valid(col):
            # retrieve values from existing disk-based factorization
            values_rootdir = self[col].rootdir + '.values'
            values = list(bcolz.carray(rootdir=values_rootdir, mode='r'))
        else:
            # no cache available: factorize on-the-fly
            _, value_map = ctable_ext.factorize(self[col])
            values = value_map.values()
        output.append(values)

    return output[0] if single_input else output
def factorize_groupby_cols(self, groupby_cols):
    """Factorize each groupby column, reusing the on-disk cache when valid.

    :type self: ctable
    :return: (factor_list, values_list) — one factor carray and one
        unique-values carray per groupby column
    """
    # first check if the factorized arrays already exist
    # unless we need to refresh the cache
    factor_list = []
    values_list = []

    for col in groupby_cols:
        if self.cache_valid(col):
            # reuse the previously persisted factor/values carrays
            base = self[col].rootdir
            factor = bcolz.carray(rootdir=base + '.factor', mode='r')
            uniques = bcolz.carray(rootdir=base + '.values', mode='r')
        else:
            # factorize in memory; wrap the unique values in a carray
            factor, value_map = ctable_ext.factorize(self[col])
            uniques = bcolz.carray(
                np.fromiter(value_map.values(), dtype=self[col].dtype))
        factor_list.append(factor)
        values_list.append(uniques)

    return factor_list, values_list
def cache_factor(self, col_list, refresh=False):
    """Create (or refresh) the on-disk factorization cache for columns.

    Existing todos here are: these should be hidden helper carrays,
    as in: not normal columns that you would normally see as a user.
    The factor (label index) carray is as long as the original carray
    (and the rest of the table therefore), but the (unique) values
    carray is only as long as the number of unique values.

    Both carrays are built in temporary directories and moved into
    place only when complete, so a crash cannot leave a half-written
    cache at the final location.

    :param col_list: a column name or list of column names
    :param refresh: when True, discard every existing cache first
    :raises TypeError: for in-memory (rootdir-less) ctables
    """
    if not self.rootdir:
        raise TypeError('Only out-of-core ctables can have '
                        'factorization caching at the moment')

    if not isinstance(col_list, list):
        col_list = [col_list]

    if refresh:
        # wipe all existing factor/values caches in one pass
        kill_list = [x for x in os.listdir(self.rootdir)
                     if '.factor' in x or '.values' in x]
        for kill_dir in kill_list:
            rm_file_or_dir(os.path.join(self.rootdir, kill_dir))

    for col in col_list:
        # create cache if needed
        if refresh or not self.cache_valid(col):
            # todo: also add locking mechanism here

            # create directories
            col_rootdir = self[col].rootdir
            col_factor_rootdir = col_rootdir + '.factor'
            col_values_rootdir = col_rootdir + '.values'
            # build in temp dirs first; moved into place below
            col_factor_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')
            col_values_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')

            try:
                # create factor (per-row label index)
                carray_factor = \
                    bcolz.carray([], dtype='int64', expectedlen=self.size,
                                 rootdir=col_factor_rootdir_tmp, mode='w')
                _, values = \
                    ctable_ext.factorize(self[col], labels=carray_factor)
                carray_factor.flush()
                rm_file_or_dir(col_factor_rootdir, ignore_errors=True)
                shutil.move(col_factor_rootdir_tmp, col_factor_rootdir)

                # create values (unique values, in label order)
                carray_values = \
                    bcolz.carray(np.fromiter(values.values(),
                                             dtype=self[col].dtype),
                                 rootdir=col_values_rootdir_tmp, mode='w')
                carray_values.flush()
                rm_file_or_dir(col_values_rootdir, ignore_errors=True)
                shutil.move(col_values_rootdir_tmp, col_values_rootdir)
            finally:
                # A tmp dir consumed by shutil.move above no longer
                # exists, so this is a no-op on success; on failure it
                # prevents temp-dir leakage.
                # NB: the original code attempted this cleanup in an
                # `else:` branch of the cache-validity check, where
                # col_factor_rootdir_tmp could be unbound (NameError
                # when the first column is already cached) — nothing
                # needs cleaning in that branch at all.
                rm_file_or_dir(col_factor_rootdir_tmp, ignore_errors=True)
                rm_file_or_dir(col_values_rootdir_tmp, ignore_errors=True)
def _calc_group_index(eval_list, factor_set, vm=None):
    """Evaluate each expression node and factorize the resulting index.

    Returns a list with one (factor_carray, values) tuple per node in
    ``eval_list``.
    """
    results = []
    for node in eval_list:
        # calculate the cartesian group index for each row
        combined = bcolz.eval(node[0], user_dict=factor_set, vm=vm)
        # now factorize the unique groupby combinations
        node_factor, node_values = ctable_ext.factorize(combined)
        results.append((node_factor, node_values))
    return results
def cache_factor(self, col_list, refresh=False):
    """Persist the factorization of the given column(s) to disk.

    Existing todos here are: these should be hidden helper carrays,
    as in: not normal columns that you would normally see as a user.
    The factor (label index) carray is as long as the original carray
    (and the rest of the table therefore), but the (unique) values
    carray is only as long as the number of unique values.

    :param col_list: a column name or a list of column names
    :param refresh: when True, rebuild even if a valid cache exists
    :raises TypeError: for in-memory (rootdir-less) ctables
    """
    if not self.rootdir:
        raise TypeError('Only out-of-core ctables can have '
                        'factorization caching at the moment')

    cols = col_list if isinstance(col_list, list) else [col_list]

    for col in cols:
        if not refresh and self.cache_valid(col):
            # a usable cache already exists; nothing to do
            continue

        base = self[col].rootdir

        # write the per-row label index next to the column
        factor_cache = bcolz.carray([], dtype='int64',
                                    expectedlen=self.size,
                                    rootdir=base + '.factor', mode='w')
        _, uniques = ctable_ext.factorize(self[col], labels=factor_cache)
        factor_cache.flush()

        # write the unique values, in label order
        values_cache = bcolz.carray(
            np.fromiter(uniques.values(), dtype=self[col].dtype),
            rootdir=base + '.values', mode='w')
        values_cache.flush()
def unique(self, col_or_col_list):
    """Return the unique values of a column, or a list of such lists
    when a list of columns is given.

    :param col_or_col_list: a column or a list of columns
    :return: unique values for the column (list of lists for a column list)
    """
    single_input = not isinstance(col_or_col_list, list)
    cols = [col_or_col_list] if single_input else col_or_col_list

    output = []
    for col in cols:
        if self.auto_cache or self.cache_valid(col):
            # create factorization cache first if it is missing
            if not self.cache_valid(col):
                self.cache_factor([col])
            # retrieve values from existing disk-based factorization
            values_rootdir = self[col].rootdir + '.values'
            values = list(bcolz.carray(rootdir=values_rootdir, mode='r'))
        else:
            # factorize on-the-fly
            _, value_map = ctable_ext.factorize(self[col])
            values = value_map.values()
        output.append(values)

    return output[0] if single_input else output
def factorize_groupby_cols(self, groupby_cols):
    """Factorize every column used in the groupby.

    Cached carrays are used when available; when auto_cache is enabled
    and no valid cache exists yet, one is created first.
    """
    # first check if the factorized arrays already exist
    # unless we need to refresh the cache
    factor_list = []
    values_list = []

    for col in groupby_cols:
        if self.auto_cache or self.cache_valid(col):
            # make sure the on-disk cache exists, then read from it
            if not self.cache_valid(col):
                self.cache_factor([col])
            base = self[col].rootdir
            factor = bcolz.carray(rootdir=base + '.factor', mode='r')
            uniques = bcolz.carray(rootdir=base + '.values', mode='r')
        else:
            # factorize in memory without touching the disk cache
            factor, value_map = ctable_ext.factorize(self[col])
            uniques = bcolz.carray(
                np.fromiter(value_map.values(), dtype=self[col].dtype))
        factor_list.append(factor)
        values_list.append(uniques)

    return factor_list, values_list
def make_group_index(self, groupby_cols, bool_arr):
    '''Create unique groups for the groupby loop.

    Factorizes the groupby columns and combines them into one factor
    carray with a single label per unique column combination.

    Args:
        groupby_cols: list of column names to group over
        bool_arr: optional boolean filter array; rows where it is falsy
            get factor label -1 and are excluded via skip_key

    Returns:
        carray: (carray_factor) per-row group label
        int: (nr_groups) the number of resulting groups
        int: (skip_key) label of the group to skip during aggregation
            (the filtered-out rows); equals nr_groups when nothing
            needs to be skipped
    '''
    factor_list, values_list = self.factorize_groupby_cols(groupby_cols)

    # create unique groups for groupby loop
    if len(factor_list) == 0:
        # no columns to groupby over, so directly aggregate the measure
        # columns to 1 total
        tmp_rootdir = self.create_tmp_rootdir()
        carray_factor = bcolz.zeros(len(self), dtype='int64',
                                    rootdir=tmp_rootdir, mode='w')
        carray_values = ['Total']
    elif len(factor_list) == 1:
        # single column groupby, the groupby output column
        # here is 1:1 to the values
        carray_factor = factor_list[0]
        carray_values = values_list[0]
    else:
        # multi column groupby
        # first combine the factorized columns to single values
        if self.group_cache_valid(col_list=groupby_cols):
            # there is a group cache that we can use
            col_rootdir = os.path.join(
                self.rootdir, self.create_group_base_name(groupby_cols))
            col_factor_rootdir = col_rootdir + '.factor'
            carray_factor = bcolz.carray(rootdir=col_factor_rootdir)
            col_values_rootdir = col_rootdir + '.values'
            carray_values = bcolz.carray(rootdir=col_values_rootdir)
        else:
            # create a brand new groupby col combination
            carray_factor, carray_values = \
                self.create_group_column_factor(factor_list, groupby_cols,
                                                cache=self.auto_cache)

    nr_groups = len(carray_values)
    skip_key = None

    if bool_arr is not None:
        # make all non relevant combinations -1
        tmp_rootdir = self.create_tmp_rootdir()
        carray_factor = bcolz.eval(
            '(factor + 1) * bool - 1',
            user_dict={'factor': carray_factor, 'bool': bool_arr},
            rootdir=tmp_rootdir, mode='w')
        # now check how many unique values there are left
        # (re-factorize so the surviving labels are dense again)
        tmp_rootdir = self.create_tmp_rootdir()
        labels = bcolz.carray([], dtype='int64',
                              expectedlen=len(carray_factor),
                              rootdir=tmp_rootdir, mode='w')
        carray_factor, values = ctable_ext.factorize(carray_factor, labels)
        # values might contain one value too much (-1) (no direct lookup
        # possible because values is a reversed dict)
        filter_check = \
            [key for key, value in values.items() if value == -1]
        if filter_check:
            skip_key = filter_check[0]
        # the new nr of groups depends on the outcome after filtering
        nr_groups = len(values)

    # using nr_groups as a total length might be one off due to the skip_key
    # (skipping a row in aggregation)
    # but that is okay normally
    if skip_key is None:
        # if we shouldn't skip a row, set it at the first row after the
        # total number of groups
        skip_key = nr_groups

    return carray_factor, nr_groups, skip_key
def create_group_column_factor(self, factor_list, groupby_cols, cache=False):
    """
    Create a unique, factorized column out of several individual columns.

    The per-column factor carrays are combined row-wise into a single
    int64 index, which is then factorized again so every unique column
    combination gets one dense label.  With ``cache=True`` the result is
    additionally persisted under the table's rootdir, guarded by a
    best-effort lock file.

    Parameters
    ----------
    factor_list : per-column factor carrays (row -> label)
    groupby_cols : names for the factor columns
    cache : persist the combined factor/values to disk when True

    Returns
    -------
    (carray_factor, carray_values) : combined per-row labels and the
        unique combined index values
    """
    if not self.rootdir:
        # in-memory scenario: no rootdirs anywhere
        input_rootdir = None
        col_rootdir = None
        col_factor_rootdir = None
        col_values_rootdir = None
        col_factor_rootdir_tmp = None
        col_values_rootdir_tmp = None
    else:
        # temporary scratch directories; moved/cleaned below
        input_rootdir = tempfile.mkdtemp(prefix='bcolz-')
        col_factor_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')
        col_values_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')

    # create combination of groupby columns: one int64 per row encoding
    # the tuple of per-column labels
    group_array = bcolz.zeros(0, dtype=np.int64, expectedlen=len(self),
                              rootdir=input_rootdir, mode='w')
    factor_table = bcolz.ctable(factor_list, names=groupby_cols)
    ctable_iter = factor_table.iter(outcols=groupby_cols, out_flavor=tuple)
    ctable_ext.create_group_index(ctable_iter, len(groupby_cols),
                                  group_array)

    # now factorize the results so combinations get dense labels
    carray_factor = \
        bcolz.carray([], dtype='int64', expectedlen=self.size,
                     rootdir=col_factor_rootdir_tmp, mode='w')
    carray_factor, values = ctable_ext.factorize(group_array,
                                                 labels=carray_factor)
    carray_factor.flush()

    carray_values = \
        bcolz.carray(np.fromiter(values.values(), dtype=np.int64),
                     rootdir=col_values_rootdir_tmp, mode='w')
    carray_values.flush()

    del group_array
    if cache:
        # clean up the temporary file
        rm_file_or_dir(input_rootdir, ignore_errors=True)

    if cache:
        # official end destination
        col_rootdir = os.path.join(
            self.rootdir, self.create_group_base_name(groupby_cols))
        col_factor_rootdir = col_rootdir + '.factor'
        col_values_rootdir = col_rootdir + '.values'
        lock_file = col_rootdir + '.lock'

        # best-effort lock: append our uuid to the lock file and check
        # whether we wrote the first line; only works for linux
        if not os.path.exists(lock_file):
            uid = str(uuid.uuid4())
            try:
                with open(lock_file, 'a+') as fn:
                    fn.write(uid + '\n')
                with open(lock_file, 'r') as fn:
                    temp = fn.read().splitlines()
                if temp[0] == uid:
                    # we won the race: our uuid is the first line
                    lock = True
                else:
                    lock = False
                del temp
            # NOTE(review): bare except treats ANY failure (including
            # e.g. KeyboardInterrupt) as "no lock"; presumably
            # intentional best-effort behavior — consider narrowing to
            # `except Exception` — TODO confirm
            except:
                lock = False
        else:
            lock = False

        if lock:
            # we own the lock: move the tmp results into their final
            # location and re-open them read-only from there
            rm_file_or_dir(col_factor_rootdir, ignore_errors=False)
            shutil.move(col_factor_rootdir_tmp, col_factor_rootdir)
            carray_factor = bcolz.carray(rootdir=col_factor_rootdir,
                                         mode='r')

            rm_file_or_dir(col_values_rootdir, ignore_errors=False)
            shutil.move(col_values_rootdir_tmp, col_values_rootdir)
            carray_values = bcolz.carray(rootdir=col_values_rootdir,
                                         mode='r')
        else:
            # another process has a lock, we will work with our current
            # files and clean up later
            self._dir_clean_list.append(col_factor_rootdir)
            self._dir_clean_list.append(col_values_rootdir)

    return carray_factor, carray_values
def make_group_index(self, factor_list, values_list, groupby_cols,
                     array_length, bool_arr):
    '''Create unique groups for the groupby loop.

    Combines the pre-factorized groupby columns into a single factor
    carray with one label per unique column combination, using numexpr
    expressions over the per-column factors.

    Args:
        factor_list: per-column factor carrays (row -> label)
        values_list: per-column unique-value collections
        groupby_cols: names of the groupby columns
        array_length: number of rows (used for the no-column case)
        bool_arr: optional boolean filter; rows where it is falsy get
            factor label -1 and are skipped via skip_key

    Returns:
        carray: (factor_carray) per-row group label
        int: (nr_groups) the number of resulting groups
        int: (skip_key) label to skip during aggregation; equals
            nr_groups when nothing needs skipping
    '''

    def _create_eval_str(groupby_cols, values_list, check_overflow=True):
        # Build numexpr expression strings mapping each row's per-column
        # labels onto a single cartesian index:
        #   -2147483648 + f0 + |v0|*f1 + |v0|*|v1|*f2 + ...
        # (offset by int32 min). With check_overflow, a new expression
        # is started whenever the cartesian size would exceed
        # 4294967295 (uint32 range). Returns a list of
        # (eval_str, col_list) nodes.
        eval_list = []
        eval_str = ''
        col_list = []
        previous_value = 1
        # Sort evaluated columns by length
        col_len_list = [(col, values) for col, values
                        in zip(groupby_cols, values_list)]
        col_len_list.sort(key=lambda x: len(x[1]))
        groupby_cols = [col for col, _ in col_len_list]
        values_list = [values for _, values in col_len_list]

        for col, values \
                in zip(groupby_cols, values_list):

            # check for overflow
            if check_overflow:
                if previous_value * len(values) > 4294967295:
                    eval_list.append((eval_str, col_list))
                    # reset
                    eval_str = ''
                    col_list = []
                    previous_value = 1

            if eval_str:
                eval_str += ' + '
            else:
                eval_str += '-2147483648 + '
            eval_str += str(previous_value) + '*' + col
            col_list.append(col)
            previous_value *= len(values)

        eval_list.append((eval_str, col_list))
        return eval_list

    def _calc_group_index(eval_list, factor_set, vm=None):
        # Evaluate each expression node and factorize the result;
        # returns one (factor_carray, values) tuple per node.
        factorize_list = []
        for eval_node in eval_list:
            # calculate the cartesian group index for each row
            factor_input = bcolz.eval(eval_node[0],
                                      user_dict=factor_set, vm=vm)
            # now factorize the unique groupby combinations
            sub_factor_carray, sub_values = \
                ctable_ext.factorize(factor_input)
            factorize_list.append((sub_factor_carray, sub_values))
        return factorize_list

    def _is_reducible(eval_list):
        # True while any node still combines more than one column,
        # i.e. another reduction pass can merge columns further.
        for eval_node in eval_list:
            if len(eval_node[1]) > 1:
                return True
        return False

    def calc_index(groupby_cols, values_list, factor_set, vm=None):
        # Iteratively reduce the columns to a single factorized index.
        # Initialize eval list
        eval_list = _create_eval_str(groupby_cols, values_list)

        # Reduce expression as possible
        while _is_reducible(eval_list):
            del groupby_cols
            del values_list
            factorize_list = _calc_group_index(eval_list, factor_set)
            # the reduced nodes become synthetic columns g0, g1, ...
            factor_set = {'g' + str(i): x[0]
                          for i, x in enumerate(factorize_list)}
            groupby_cols = ['g' + str(i)
                            for i, x in enumerate(factorize_list)]
            values_list = [x[1] for i, x in enumerate(factorize_list)]
            eval_list = _create_eval_str(groupby_cols, values_list)

        # If we have multiple expressions that cannot be reduced
        # anymore, rewrite as a single one and use Python vm
        if len(eval_list) > 1:
            eval_list = _create_eval_str(groupby_cols, values_list,
                                         check_overflow=False)
            vm = 'python'

        del groupby_cols
        del values_list

        # Now we have a single expression, factorize it
        return _calc_group_index(eval_list, factor_set, vm=vm)[0]

    # create unique groups for groupby loop
    if len(factor_list) == 0:
        # no columns to groupby over, so directly aggregate the measure
        # columns to 1 total (index 0/zero)
        factor_carray = bcolz.zeros(array_length, dtype='int64')
        values = ['Total']
    elif len(factor_list) == 1:
        # single column groupby, the groupby output column
        # here is 1:1 to the values
        factor_carray = factor_list[0]
        values = values_list[0]
    else:
        # multi column groupby
        # nb: this might also be cached in the future
        # first combine the factorized columns to single values
        factor_set = {x: y for x, y in zip(groupby_cols, factor_list)}
        # create a numexpr expression that calculates the place on
        # a cartesian join index
        factor_carray, values = calc_index(groupby_cols, values_list,
                                           factor_set)

    skip_key = None

    if bool_arr is not None:
        # make all non relevant combinations -1
        factor_carray = bcolz.eval(
            '(factor + 1) * bool - 1',
            user_dict={'factor': factor_carray, 'bool': bool_arr})
        # now check how many unique values there are left
        factor_carray, values = ctable_ext.factorize(factor_carray)
        # values might contain one value too much (-1) (no direct lookup
        # possible because values is a reversed dict)
        filter_check = \
            [key for key, value in values.items() if value == -1]
        if filter_check:
            skip_key = filter_check[0]

    # using nr_groups as a total length might be one off due to the skip_key
    # (skipping a row in aggregation)
    # but that is okay normally
    nr_groups = len(values)
    if skip_key is None:
        # if we shouldn't skip a row, set it at the first row after the
        # total number of groups
        skip_key = nr_groups

    return factor_carray, nr_groups, skip_key