def evaluate(self, context):
    ctx_filter = context.get('__filter__')

    id_to_rownum = context.id_to_rownum

    # at some point ctx_filter will be cached automatically, so we don't
    # need to take care of it manually here
    if ctx_filter is not None:
        set1filter = expr_eval(ctx_filter & self.set1filter, context)
        set2filter = expr_eval(ctx_filter & self.set2filter, context)
    else:
        set1filter = expr_eval(self.set1filter, context)
        set2filter = expr_eval(self.set2filter, context)

    rank1_expr = self.rank1_expr
    rank2_expr = self.rank2_expr
    used_variables1 = rank1_expr.collect_variables(context)
    used_variables2 = rank2_expr.collect_variables(context)
    used_variables1.add('id')
    used_variables2.add('id')

    set1 = context_subset(context, set1filter, used_variables1)
    set2 = context_subset(context, set2filter, used_variables2)
    set1len = set1filter.sum()
    set2len = set2filter.sum()
    tomatch = min(set1len, set2len)

    order1 = expr_eval(rank1_expr, context)
    order2 = expr_eval(rank2_expr, context)
    if not self.ascending1:
        order1 = -order1  # reverse sorting
    if not self.ascending2:
        order2 = -order2  # reverse sorting

    sorted_set1_indices = order1[set1filter].argsort()
    sorted_set2_indices = order2[set2filter].argsort()
    idx1 = sorted_set1_indices[:tomatch]
    idx2 = sorted_set2_indices[:tomatch]

    print("matching with %d/%d individuals" % (set1len, set2len))

    result = np.empty(context_length(context), dtype=int)
    result.fill(-1)
    id1 = set1['id'][idx1]
    id2 = set2['id'][idx2]
    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
    return result
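# Illustrative sketch (hypothetical helper, not part of the original module):
# the rank-based matching above boils down to sorting both filtered sets on
# their rank expressions and pairing them position by position.
def _demo_rank_matching(rank1, rank2):
    import numpy as np
    rank1 = np.asarray(rank1)
    rank2 = np.asarray(rank2)
    n = min(len(rank1), len(rank2))
    idx1 = rank1.argsort()[:n]
    idx2 = rank2.argsort()[:n]
    # the i-th ranked individual of set1 is paired with the i-th of set2
    return list(zip(idx1, idx2))

# _demo_rank_matching([3.0, 1.0, 2.0], [10.0, 30.0]) == [(1, 0), (2, 1)]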
def evaluate(self, context):
    expr = self.expr
    expr_vars = collect_variables(expr, context)
    expressions = self.expressions
    labels = [str(e) for e in expressions]
    columns = [expr_eval(e, context) for e in expressions]

    if self.filter is not None:
        filter_value = expr_eval(self.filter, context)
        #TODO: make a function out of this, I think we have this pattern
        # in several places
        filtered_columns = [col[filter_value]
                            if isinstance(col, np.ndarray) and col.shape
                            else [col]
                            for col in columns]
        filtered_context = context_subset(context, filter_value, expr_vars)
    else:
        filtered_columns = columns
        filtered_context = context

    possible_values = self.pvalues
    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # We pre-filtered columns instead of passing the filter to partition_nd
    # because it is a bit faster this way. The indices are still correct,
    # because we use them on a filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        return LabeledArray([], labels, possible_values)

    # evaluate the expression on each group
    data = [expr_eval(expr, context_subset(filtered_context, indices,
                                           expr_vars))
            for indices in groups]

    #TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of list.
    # the first variable is the outer-most "loop",
    # the last one the inner most.

    # add total for each row
    len_pvalues = [len(vals) for vals in possible_values]
    width = len_pvalues[-1]
    height = prod(len_pvalues[:-1])
    rows_indices = [np.concatenate([groups[y * width + x]
                                    for x in range(width)])
                    for y in range(height)]
    cols_indices = [np.concatenate([groups[y * width + x]
                                    for y in range(height)])
                    for x in range(width)]
    cols_indices.append(np.concatenate(cols_indices))

    # evaluate the expression on each "combined" group (ie compute totals)
    row_totals = [expr_eval(expr, context_subset(filtered_context, inds,
                                                 expr_vars))
                  for inds in rows_indices]
    col_totals = [expr_eval(expr, context_subset(filtered_context, inds,
                                                 expr_vars))
                  for inds in cols_indices]

    if self.percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behaviour of raising an exception.
        # This can happen at least when using the default expr (count())
        # and the filter yields empty groups
        total_value = np.float64(col_totals[-1])
        data = [100.0 * value / total_value for value in data]
        row_totals = [100.0 * value / total_value for value in row_totals]
        col_totals = [100.0 * value / total_value for value in col_totals]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in izip(data, divisors)]

    # convert to a 1d array. We don't simply use data = np.array(data),
    # because if data is a list of ndarray (for example if we use
    # groupby(a, expr=id), *and* all the ndarrays have the same length,
    # the result is a 2d array instead of an array of ndarrays like we
    # need (at this point).
    arr = np.empty(len(data), dtype=type(data[0]))
    arr[:] = data
    data = arr

    # and reshape it
    data = data.reshape(len_pvalues)
    return LabeledArray(data, labels, possible_values,
                        row_totals, col_totals)
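# Illustrative sketch (not part of the original module): why the code above
# fills a pre-allocated array instead of calling np.array(data). When every
# element is an ndarray of the same length, np.array() silently stacks them
# into a 2d array, while an object array keeps them as an array of ndarrays.
def _demo_object_array_vs_np_array():
    import numpy as np
    chunks = [np.array([1, 2]), np.array([3, 4])]
    stacked = np.array(chunks)       # shape (2, 2): a 2d array
    arr = np.empty(len(chunks), dtype=object)
    arr[:] = chunks                  # shape (2,): an array of ndarrays
    return stacked.shape, arr.shape  # ((2, 2), (2,))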
def evaluate(self, context):
    source_entity = context['__entity__']
    if self.entity_name is None:
        target_entity = source_entity
    else:
        target_entity = entity_registry[self.entity_name]

    if target_entity is source_entity:
        target_context = context
    else:
        target_context = EntityContext(target_entity,
                                       {'period': context['period']})

    ctx_filter = context.get('__filter__')

    if self.filter is not None and ctx_filter is not None:
        filter_expr = ctx_filter & self.filter
    elif self.filter is not None:
        filter_expr = self.filter
    elif ctx_filter is not None:
        filter_expr = ctx_filter
    else:
        filter_expr = None

    if filter_expr is not None:
        to_give_birth = expr_eval(filter_expr, context)
        num_birth = to_give_birth.sum()
    elif self.number is not None:
        to_give_birth = None
        num_birth = self.number
    else:
        raise Exception('no filter nor number in "new"')

    array = target_entity.array

    id_to_rownum = target_entity.id_to_rownum
    num_individuals = len(id_to_rownum)

    children = self._initial_values(array, to_give_birth, num_birth)

    # handle the real duplication case
    if self.num_duplicate is not None:
        number_rep = array[self.num_duplicate].compress(
            array[self.num_duplicate] > 0)
        children = children.repeat(number_rep, axis=0)
        num_birth = number_rep.sum()

    if self.expand:
        from numpy.lib.stride_tricks import as_strided

        id_add = np.arange(number_rep.max())
        id_add = as_strided(id_add,
                            shape=number_rep.shape + id_add.shape,
                            strides=(0,) + id_add.strides)
        id_add = id_add[id_add < number_rep[:, None]]
        one_by_house = array['res'].compress(array[self.num_duplicate] > 0)
        # indices = np.unique(one_by_house)
        # size_by_id = np.bincount(one_by_house)
        # size_by_id = size_by_id.compress(size_by_id > 0)
        # size_by_id = size_by_id.repeat(size_by_id)
        id_ini = one_by_house.repeat(number_rep, axis=0)
        decalage = np.zeros(len(one_by_house), dtype=int)
        indices = np.unique(one_by_house, return_index=True)[1]
        decalage[indices[1:]] = number_rep[indices]
        decalage = decalage.cumsum().repeat(number_rep, axis=0)
        # decalage = decalage - decalage[0]
        children['res'] = id_add + decalage + array['res'].max() + 1

    remember_id = children['id'].copy()

    if num_birth:
        children['id'] = np.arange(num_individuals,
                                   num_individuals + num_birth)
        children['period'] = context['period']

        used_variables = self._collect_kwargs_variables(context)
        if to_give_birth is None:
            child_context = new_context_like(context, length=num_birth)
        else:
            child_context = context_subset(context, to_give_birth,
                                           used_variables)
        for k, v in self.kwargs.iteritems():
            children[k] = expr_eval(v, child_context)

    if self.numerotation is not None:
        from numpy.lib.stride_tricks import as_strided

        initial = np.zeros(len(array), dtype=bool)
        id_dup = np.arange(number_rep.max())
        id_dup = as_strided(id_dup,
                            shape=number_rep.shape + id_dup.shape,
                            strides=(0,) + id_dup.strides)
        id_dup = id_dup[id_dup < number_rep[:, None]] + 1
        children[self.numerotation] = id_dup

    add_individuals(target_context, children)

    # result is the ids of the new individuals corresponding to the source
    # entity (changed here to optionally return the "father" ids instead)
    if to_give_birth is not None:
        if self.return_option is None:
            result = np.empty(context_length(context), dtype=int)
            result.fill(-1)
            # TODO: something must change so that the father size is
            # correct with the target entity and not with the source.
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a tad faster, but is currently buggy
            # when working with columns of structured arrays.
            # See http://projects.scipy.org/numpy/ticket/1869
            result[to_give_birth] = children['id']
            return result
        elif self.return_option == 'father':
            father = np.empty(context_length(context), dtype=int)
            father.fill(-1)
            list_children = np.ones(num_birth, dtype=bool)
            initial = np.zeros(len(array), dtype=bool)
            birth = np.concatenate((initial, list_children))
            father[birth] = remember_id
            return father
    else:
        return None
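# Illustrative sketch (not part of the original module): the as_strided trick
# used above expands, for each row i, the offsets 0..number_rep[i]-1 without
# a Python loop; it is equivalent to concatenating np.arange(n) for each n.
def _demo_per_row_offsets():
    import numpy as np
    from numpy.lib.stride_tricks import as_strided
    number_rep = np.array([2, 1, 3])
    id_add = np.arange(number_rep.max())
    id_add = as_strided(id_add,
                        shape=number_rep.shape + id_add.shape,
                        strides=(0,) + id_add.strides)
    # keep, for each row, only the first number_rep[i] offsets
    id_add = id_add[id_add < number_rep[:, None]]
    # array([0, 1, 0, 0, 1, 2]), i.e. the same as
    # np.concatenate([np.arange(n) for n in number_rep])
    return id_add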
def evaluate(self, context):
    global local_ctx

    ctx_filter = context.get('__filter__')

    id_to_rownum = context.id_to_rownum

    # at some point ctx_filter will be cached automatically, so we don't
    # need to take care of it manually here
    if ctx_filter is not None:
        set1filter = expr_eval(ctx_filter & self.set1filter, context)
        set2filter = expr_eval(ctx_filter & self.set2filter, context)
    else:
        set1filter = expr_eval(self.set1filter, context)
        set2filter = expr_eval(self.set2filter, context)

    score_expr = self.score_expr

    used_variables = score_expr.collect_variables(context)
    used_variables1 = [v for v in used_variables
                       if not v.startswith('__other_')]
    used_variables2 = [v[8:] for v in used_variables
                       if v.startswith('__other_')]

    set1 = context_subset(context, set1filter, ['id'] + used_variables1)
    set2 = context_subset(context, set2filter, ['id'] + used_variables2)
    set1len = set1filter.sum()
    set2len = set2filter.sum()
    tomatch = min(set1len, set2len)

    orderby = self.orderby
    if not isinstance(orderby, str):
        order = expr_eval(orderby, context)
    else:
        order = np.zeros(context_length(context), dtype=int)
        if orderby == 'EDtM':
            for var in used_variables1:
                order[set1filter] += (set1[var] - set1[var].mean()) ** 2 \
                                     / set1[var].var()
        if orderby == 'SDtOM':
            order_ctx = dict((k if k in used_variables1 else k, v)
                             for k, v in set1.iteritems())
            order_ctx.update(('__other_' + k, set2[k].mean())
                             for k in used_variables2)
            order[set1filter] = expr_eval(score_expr, order_ctx)

    sorted_set1_indices = order[set1filter].argsort()[::-1]
    set1tomatch = sorted_set1_indices[:tomatch]
    print("matching with %d/%d individuals" % (set1len, set2len))

    #TODO: compute pk_names automatically: variables which are either
    # boolean, or have very few possible values and which are used more
    # than once in the expression and/or which are used in boolean
    # expressions
    # pk_names = ('eduach', 'work')
    # optimized_exprs = {}

    result = np.empty(context_length(context), dtype=int)
    result.fill(-1)

    local_ctx = dict(('__other_' + k if k in ['id'] + used_variables2
                      else k, v)
                     for k, v in set2.iteritems())

    if self.pool_size is None:
        #noinspection PyUnusedLocal
        def match_one_set1_individual(idx, sorted_idx):
            global local_ctx

            if not context_length(local_ctx):
                raise StopIteration

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in ['id'] + used_variables1)

            # pk = tuple(individual1[fname] for fname in pk_names)
            # optimized_expr = optimized_exprs.get(pk)
            # if optimized_expr is None:
            #     for name in pk_names:
            #         fake_set1['__f_%s' % name].value = individual1[name]
            #     optimized_expr = str(symbolic_expr.simplify())
            #     optimized_exprs[pk] = optimized_expr
            # set2_scores = evaluate(optimized_expr, mm_dict, set2)

            set2_scores = expr_eval(score_expr, local_ctx)

            individual2_idx = np.argmax(set2_scores)

            id1 = local_ctx['id']
            id2 = local_ctx['__other_id'][individual2_idx]

            local_ctx = context_delete(local_ctx, individual2_idx)

            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1

        loop_wh_progress(match_one_set1_individual, set1tomatch)
    else:
        pool_size = self.pool_size

        #noinspection PyUnusedLocal
        def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
            global local_ctx

            set2_size = context_length(local_ctx)
            if not set2_size:
                raise StopIteration

            if set2_size > pool_size:
                pool = random.sample(xrange(context_length(local_ctx)),
                                     pool_size)
            else:
                pool = range(set2_size)

            sub_local_ctx = context_subset(local_ctx, pool, None)
            sub_local_ctx.update((k, set1[k][sorted_idx])
                                 for k in ['id'] + used_variables1)

            set2_scores = expr_eval(score_expr, sub_local_ctx)

            individual2_pool_idx = np.argmax(set2_scores)
            individual2_idx = pool[individual2_pool_idx]

            id1 = sub_local_ctx['id']
            id2 = local_ctx['__other_id'][individual2_idx]

            local_ctx = context_delete(local_ctx, individual2_idx)

            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1

        loop_wh_progress(match_one_set1_individual_pool, set1tomatch,
                         pool_size=pool_size)

    return result
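# Illustrative sketch (hypothetical helper, plain numpy/python): the greedy
# pooled matching above, assuming set1 rows are already in priority order.
# For each set1 individual we score a random pool of the remaining set2
# candidates and remove the argmax from the candidate list.
def _demo_greedy_pool_matching(scores, pool_size=10, seed=0):
    # scores[i, j]: score of pairing set1 individual i with set2 candidate j
    import random
    import numpy as np
    rng = random.Random(seed)
    remaining = list(range(scores.shape[1]))
    matches = {}
    for i in range(min(scores.shape[0], scores.shape[1])):
        if len(remaining) > pool_size:
            pool = rng.sample(remaining, pool_size)
        else:
            pool = list(remaining)
        best = pool[int(np.argmax(scores[i, pool]))]
        matches[i] = best
        remaining.remove(best)
    return matches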
def evaluate(self, context):
    global local_ctx
    global cost

    ctx_filter = context.get('__filter__')

    id_to_rownum = context.id_to_rownum

    # at some point ctx_filter will be cached automatically, so we don't
    # need to take care of it manually here
    if ctx_filter is not None:
        set1filter = expr_eval(ctx_filter & self.set1filter, context)
        set2filter = expr_eval(ctx_filter & self.set2filter, context)
    else:
        set1filter = expr_eval(self.set1filter, context)
        set2filter = expr_eval(self.set2filter, context)

    score_expr = self.score_expr

    used_variables = score_expr.collect_variables(context)
    used_variables1 = ['id'] + [v for v in used_variables
                                if not v.startswith('__other_')]
    used_variables2 = ['id'] + [v[8:] for v in used_variables
                                if v.startswith('__other_')]

    set1 = context_subset(context, set1filter, used_variables1)
    set2 = context_subset(context, set2filter, used_variables2)
    orderby = expr_eval(self.orderby, context)

    sorted_set1_indices = orderby[set1filter].argsort()[::-1]

    print("matching with %d/%d individuals" % (set1filter.sum(),
                                               set2filter.sum()))

    #TODO: compute pk_names automatically: variables which are either
    # boolean, or have very few possible values and which are used more
    # than once in the expression and/or which are used in boolean
    # expressions
    # pk_names = ('eduach', 'work')
    # optimized_exprs = {}

    result = np.empty(context_length(context), dtype=int)
    result.fill(-1)

    local_ctx = dict(('__other_' + k if k in used_variables2 else k, v)
                     for k, v in set2.iteritems())
    # print local_ctx
    # test = local_ctx.copy()
    # test.update((k, set1[k]) for k in used_variables1)

    # ######## Munkres (maximum-weight assignment) attempt
    if self.option == "optimal":
        cost = []

        def create_cost(idx, sorted_idx):
            global cost

            if not context_length(local_ctx):
                raise StopIteration

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in used_variables1)
            set2_scores = expr_eval(score_expr, local_ctx)
            cost.append(set2_scores[:].tolist())

        loop_wh_progress(create_cost, sorted_set1_indices)
        resultat = MunkresX.maxWeightMatching(cost)
        for id1, id2 in resultat.items():
            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1
        return result
    else:
        def match_one_set1_individual(idx, sorted_idx):
            global local_ctx

            if not context_length(local_ctx):
                raise StopIteration

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in used_variables1)
            set2_scores = expr_eval(score_expr, local_ctx)
            # print set2_scores
            individual2_idx = np.argmax(set2_scores)

            id1 = local_ctx['id']
            id2 = local_ctx['__other_id'][individual2_idx]

            local_ctx = context_delete(local_ctx, individual2_idx)

            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1

        loop_wh_progress(match_one_set1_individual, sorted_set1_indices)
        return result
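# Illustrative alternative (not the MunkresX helper used above): assuming
# maxWeightMatching solves a maximum-weight assignment on the score matrix,
# scipy's Hungarian-algorithm solver does the same job; the names below are
# only for this demo.
def _demo_optimal_assignment(score_matrix):
    import numpy as np
    from scipy.optimize import linear_sum_assignment
    scores = np.asarray(score_matrix, dtype=float)
    rows, cols = linear_sum_assignment(scores, maximize=True)
    # pairs of (set1 index, set2 index) maximizing the total score
    return list(zip(rows, cols))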
def match_cell(idx, sorted_idx, pool_size):
    global matching_ctx

    set2_size = context_length(matching_ctx)
    if not set2_size:
        raise StopIteration

    if pool_size is not None and set2_size > pool_size:
        pool = random.sample(xrange(set2_size), pool_size)
        local_ctx = context_subset(matching_ctx, pool)
    else:
        local_ctx = matching_ctx.copy()

    local_ctx.update((k, set1[k][sorted_idx])
                     for k in {'__ids__'} | used_variables1)

    eval_ctx = context.clone(entity_data=local_ctx)
    set2_scores = expr_eval(score, eval_ctx)
    cell2_idx = set2_scores.argmax()

    cell1ids = local_ctx['__ids__']
    cell2ids = local_ctx['__other___ids__'][cell2_idx]

    if pool_size is not None and set2_size > pool_size:
        # transform the pool-local index into a set/matching_ctx index
        cell2_idx = pool[cell2_idx]

    cell1size = len(cell1ids)
    cell2size = len(cell2ids)
    nb_match = min(cell1size, cell2size)

    # we could introduce a random choice here but it is not really
    # necessary. In that case, it should be done in group_context
    ids1 = cell1ids[:nb_match]
    ids2 = cell2ids[:nb_match]

    result[id_to_rownum[ids1]] = ids2
    result[id_to_rownum[ids2]] = ids1

    if nb_match == cell2size:
        matching_ctx = context_delete(matching_ctx, cell2_idx)
    else:
        # other variables do not need to be modified since the cell
        # only got smaller and was not deleted
        matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

    # FIXME: the expr gets cached for the full matching_ctx at the
    # beginning and then when another woman with the same values is
    # found, it thinks it can reuse the expr but it breaks because it
    # has not the correct length.
    # the current workaround is to invalidate the whole cache for the
    # current entity but this is not the right way to go.
    # * disable the cache for matching?
    # * use a local cache so that methods after matching() can use
    #   what was in the cache before matching(). Shouldn't the cache be
    #   stored inside the context anyway?
    expr_cache.invalidate(context.period, context.entity_name)

    if nb_match < cell1size:
        set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
        match_cell(idx, sorted_idx, pool_size)
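# Illustrative sketch (hypothetical helper): the cell-based variant above
# matches whole groups of identical individuals at once. Each cell carries a
# list of ids; at most min(len(cell1), len(cell2)) pairs are formed and the
# leftover ids stay in their (shrunken) cell for later iterations.
def _demo_match_cells(cell1_ids, cell2_ids):
    nb_match = min(len(cell1_ids), len(cell2_ids))
    pairs = list(zip(cell1_ids[:nb_match], cell2_ids[:nb_match]))
    left1 = cell1_ids[nb_match:]  # still unmatched in cell 1
    left2 = cell2_ids[nb_match:]  # remainder of cell 2 (empty -> delete cell)
    return pairs, left1, left2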
def evaluate(self, context):
    global matching_ctx

    ctx_filter = context.get('__filter__')

    id_to_rownum = context.id_to_rownum

    # at some point ctx_filter will be cached automatically, so we don't
    # need to take care of it manually here
    if ctx_filter is not None:
        set1filter = expr_eval(ctx_filter & self.set1filter, context)
        set2filter = expr_eval(ctx_filter & self.set2filter, context)
    else:
        set1filter = expr_eval(self.set1filter, context)
        set2filter = expr_eval(self.set2filter, context)

    score_expr = self.score_expr

    used_variables = score_expr.collect_variables(context)
    used_variables1 = ['id'] + [v for v in used_variables
                                if not v.startswith('__other_')]
    used_variables2 = ['id'] + [v[8:] for v in used_variables
                                if v.startswith('__other_')]

    #TODO: we should detect whether or not we are using non-simple
    # expressions (EvaluableExpression children) and pre-evaluate them,
    # because otherwise they are re-evaluated on all of set2 for each
    # individual in set1. See https://github.com/liam2/liam2/issues/128
    set1 = context_subset(context, set1filter, used_variables1)
    set2 = context_subset(context, set2filter, used_variables2)
    orderby = expr_eval(self.orderby, context)

    set1len = set1filter.sum()
    set2len = set2filter.sum()
    tomatch = min(set1len, set2len)
    sorted_set1_indices = orderby[set1filter].argsort()[::-1]
    set1tomatch = sorted_set1_indices[:tomatch]
    print("matching with %d/%d individuals" % (set1len, set2len))

    #TODO: compute pk_names automatically: variables which are either
    # boolean, or have very few possible values and which are used more
    # than once in the expression and/or which are used in boolean
    # expressions
    # pk_names = ('eduach', 'work')
    # optimized_exprs = {}

    result = np.empty(context_length(context), dtype=int)
    result.fill(-1)

    matching_ctx = dict(('__other_' + k if k in used_variables2 else k, v)
                        for k, v in set2.iteritems())

    #noinspection PyUnusedLocal
    def match_one_set1_individual(idx, sorted_idx):
        global matching_ctx

        if not context_length(matching_ctx):
            raise StopIteration

        local_ctx = matching_ctx.copy()
        local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)

        # pk = tuple(individual1[fname] for fname in pk_names)
        # optimized_expr = optimized_exprs.get(pk)
        # if optimized_expr is None:
        #     for name in pk_names:
        #         fake_set1['__f_%s' % name].value = individual1[name]
        #     optimized_expr = str(symbolic_expr.simplify())
        #     optimized_exprs[pk] = optimized_expr
        # set2_scores = evaluate(optimized_expr, mm_dict, set2)

        set2_scores = expr_eval(score_expr, local_ctx)

        individual2_idx = np.argmax(set2_scores)

        id1 = local_ctx['id']
        id2 = matching_ctx['__other_id'][individual2_idx]

        matching_ctx = context_delete(matching_ctx, individual2_idx)

        result[id_to_rownum[id1]] = id2
        result[id_to_rownum[id2]] = id1

    loop_wh_progress(match_one_set1_individual, set1tomatch,
                     title="Matching...")
    return result
def evaluate(self, context):
    source_entity = context['__entity__']
    if self.entity_name is None:
        target_entity = source_entity
    else:
        target_entity = entity_registry[self.entity_name]

    if target_entity is source_entity:
        target_context = context
    else:
        target_context = \
            EntityContext(target_entity,
                          {'period': context['period'],
                           '__globals__': context['__globals__']})

    ctx_filter = context.get('__filter__')

    if self.filter is not None and ctx_filter is not None:
        filter_expr = ctx_filter & self.filter
    elif self.filter is not None:
        filter_expr = self.filter
    elif ctx_filter is not None:
        filter_expr = ctx_filter
    else:
        filter_expr = None

    if filter_expr is not None:
        to_give_birth = expr_eval(filter_expr, context)
        num_birth = to_give_birth.sum()
    elif self.number is not None:
        to_give_birth = None
        num_birth = self.number
    else:
        raise Exception('no filter nor number in "new"')

    array = target_entity.array

    id_to_rownum = target_entity.id_to_rownum
    num_individuals = len(id_to_rownum)

    children = self._initial_values(array, to_give_birth, num_birth)
    if num_birth:
        children['id'] = np.arange(num_individuals,
                                   num_individuals + num_birth)
        children['period'] = context['period']

        used_variables = self._collect_kwargs_variables(context)
        if to_give_birth is None:
            child_context = new_context_like(context, length=num_birth)
        else:
            child_context = context_subset(context, to_give_birth,
                                           used_variables)
        for k, v in self.kwargs.iteritems():
            children[k] = expr_eval(v, child_context)

    add_individuals(target_context, children)

    # result is the ids of the new individuals corresponding to the source
    # entity
    if to_give_birth is not None:
        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)
        if source_entity is target_entity:
            extra_bools = np.zeros(num_birth, dtype=bool)
            to_give_birth = np.concatenate((to_give_birth, extra_bools))
        # Note that np.place is a bit faster, but is currently buggy when
        # working with columns of structured arrays.
        # See http://projects.scipy.org/numpy/ticket/1869
        result[to_give_birth] = children['id']
        return result
    else:
        return None
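# Illustrative sketch (hypothetical names): how the result vector above maps
# each source row to the id of the child it created, with -1 everywhere else.
def _demo_birth_result(to_give_birth, child_ids):
    import numpy as np
    result = np.full(len(to_give_birth), -1, dtype=int)
    result[np.asarray(to_give_birth, dtype=bool)] = child_ids
    return result

# _demo_birth_result([False, True, False, True], [100, 101])
# -> array([ -1, 100,  -1, 101])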
def evaluate(self, context):
    expressions = self.expressions
    columns = [expr_eval(e, context) for e in expressions]

    if self.filter is not None:
        filter_value = expr_eval(self.filter, context)
        # TODO: make a function out of this, I think we have this pattern
        # in several places
        filtered_columns = [col[filter_value]
                            if isinstance(col, np.ndarray) and col.shape
                            else [col]
                            for col in columns]
    else:
        filtered_columns = columns

    possible_values = [np.unique(col) for col in filtered_columns]
    groups = partition_nd(filtered_columns, True, possible_values)

    # TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of lists.
    # the first variable is the outer-most "loop",
    # the last one the inner-most.

    # add a total for each row
    folded_exprs = len(expressions) - 1
    len_pvalues = [len(vals) for vals in possible_values]
    width = len_pvalues[-1]
    height = prod(len_pvalues[:-1])

    def xy_to_idx(x, y):
        # divide by the prod of possible values of the expressions to its
        # right, mod by its own number of possible values
        offsets = [(y / prod(len_pvalues[v + 1:folded_exprs]))
                   % len_pvalues[v]
                   for v in range(folded_exprs)]
        return sum(v * prod(len_pvalues[i + 1:])
                   for i, v in enumerate(offsets)) + x

    groups_wh_totals = []
    for y in range(height):
        line_indices = []
        for x in range(width):
            member_indices = groups[xy_to_idx(x, y)]
            groups_wh_totals.append(member_indices)
            line_indices.extend(member_indices)
        groups_wh_totals.append(line_indices)

    # width just increased because of the totals
    width += 1

    # add a total for each column (including the "total per row" one)
    for x in range(width):
        column_indices = []
        for y in range(height):
            column_indices.extend(groups_wh_totals[y * width + x])
        groups_wh_totals.append(column_indices)

    # evaluate the expression on each group
    expr = self.expr
    used_variables = expr.collect_variables(context)
    used_variables.add("id")

    data = []
    for member_indices in groups_wh_totals:
        local_context = context_subset(context, member_indices,
                                       used_variables)
        data.append(expr_eval(expr, local_context))

    if self.percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behavior of raising an exception.
        # This can happen at least when using the default expr (grpcount())
        # and the filter yields empty groups
        total_value = np.float64(data[-1])
        data = [100.0 * value / total_value for value in data]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in izip(data, divisors)]

    # example layouts:
    # gender | False | True | total
    #        |    20 |   16 |    36

    # gender | False | True |
    # dead   |       |      | total
    # False  |    20 |   15 |    35
    # True   |     0 |    1 |     1
    # total  |    20 |   16 |    36

    #          |   dead | False | True |
    # agegroup | gender |       |      | total
    #        5 |  False |    20 |    15 |    xx
    #        5 |   True |     0 |     1 |    xx
    #       10 |  False |    25 |    10 |    xx
    #       10 |   True |     1 |     1 |    xx
    #          |  total |    xx |    xx |    xx

    # add headers
    labels = [str(e) for e in expressions]
    if folded_exprs:
        result = [[""] * (folded_exprs - 1) +
                  [labels[-1]] +
                  list(possible_values[-1]) +
                  [""],
                  # 2nd line
                  labels[:-1] +
                  [""] * len(possible_values[-1]) +
                  ["total"]]
        categ_values = list(product(*possible_values[:-1]))
        last_line = [""] * (folded_exprs - 1) + ["total"]
        categ_values.append(last_line)
        height += 1
    else:
        # if there is only one expression, the headers are different
        result = [[labels[-1]] + list(possible_values[-1]) + ["total"]]
        categ_values = [[""]]

    for y in range(height):
        result.append(list(categ_values[y]) +
                      data[y * width:(y + 1) * width])

    return PrettyTable(result)
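# Illustrative sketch (not part of the original module): what xy_to_idx
# computes. groups is laid out flat with the first expression as the
# outermost loop, so the flat index of cell (x, y) is the mixed-radix
# encoding of the row coordinates followed by x.
def _demo_xy_to_idx(len_pvalues, x, y):
    import numpy as np
    folded = len(len_pvalues) - 1
    # decode y into one coordinate per folded expression (outermost first)
    offsets = [(y // int(np.prod(len_pvalues[v + 1:folded])))
               % len_pvalues[v]
               for v in range(folded)]
    # re-encode the coordinates plus x into a flat index over all cells
    return sum(c * int(np.prod(len_pvalues[i + 1:]))
               for i, c in enumerate(offsets)) + x

# _demo_xy_to_idx([2, 3, 4], x=1, y=4) == 17  (cell with coordinates (1, 1, 1))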