def compute(self, context, set1filter, set2filter, orderby1, orderby2):
    set1filterexpr = self._getfilter(context, set1filter)
    set1filtervalue = expr_eval(set1filterexpr, context)
    set2filterexpr = self._getfilter(context, set2filter)
    set2filtervalue = expr_eval(set2filterexpr, context)
    set1len = set1filtervalue.sum()
    set2len = set2filtervalue.sum()
    numtomatch = min(set1len, set2len)
    print("matching with %d/%d individuals" % (set1len, set2len))
    result = np.full(context_length(context), -1, dtype=int)
    if not numtomatch:
        return result

    sorted_set1_indices = orderby1[set1filtervalue].argsort()[-numtomatch:]
    sorted_set2_indices = orderby2[set2filtervalue].argsort()[-numtomatch:]

    set1ids = context['id'][set1filtervalue]
    set2ids = context['id'][set2filtervalue]

    id_to_rownum = context.id_to_rownum
    id1 = set1ids[sorted_set1_indices]
    id2 = set2ids[sorted_set2_indices]
    # cannot use sorted_setX_indices because those are "local" indices
    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
    return result
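# The method above pairs the two sets by rank: both filtered orderby arrays are
# argsort()ed, the top `numtomatch` positions of each are kept, and the i-th
# highest of set 1 is matched with the i-th highest of set 2. Below is a
# minimal, self-contained sketch of that idiom using plain numpy and toy data
# (no LIAM2 context involved; all names here are illustrative only).
import numpy as np

ids = np.arange(6)                        # in this toy, ids equal row numbers
id_to_rownum = np.arange(6)
set1mask = np.array([True, True, True, False, False, False])
set2mask = ~set1mask
orderby1 = np.array([5., 1., 3., 0., 0., 0.])
orderby2 = np.array([0., 0., 0., 2., 9., 4.])

numtomatch = int(min(set1mask.sum(), set2mask.sum()))   # -> 3
top1 = orderby1[set1mask].argsort()[-numtomatch:]       # local indices, ascending
top2 = orderby2[set2mask].argsort()[-numtomatch:]
id1 = ids[set1mask][top1]
id2 = ids[set2mask][top2]

result = np.full(len(ids), -1, dtype=int)
result[id_to_rownum[id1]] = id2          # each individual gets its partner's id
result[id_to_rownum[id2]] = id1
# result -> [4, 3, 5, 1, 0, 2]: id 0 (highest orderby1) is paired with id 4
# (highest orderby2), and so on down the ranks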
def align_no_link(self, context, score, need, filter, take, leave,
                  expressions, possible_values, errors, frac_need, link,
                  secondary_axis, method):
    ctx_length = context_length(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values)

    filter_value = expr_eval(self._getfilter(context, filter), context)
    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    # retrieve the columns we need to work with
    if expressions:
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)
    # FIXME: either handle past_error in no link (currently, the past
    # error is added... but never computed, so always 0!) or raise
    # an error in case errors='carry' is used with no link.
    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                score, take, leave, method)
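# partition_nd above buckets the filtered rows by their combination of column
# values. A rough single-column sketch of what such a partition yields, using
# only numpy (the real partition_nd is n-dimensional and returns one group per
# combination of possible values; the names below are illustrative only):
import numpy as np

col = np.array([1, 2, 1, 9, 2, 1])        # 9 is missing from possible_values
filter_value = np.ones(len(col), dtype=bool)
possible_values = [1, 2, 3]
groups = [np.nonzero(filter_value & (col == v))[0] for v in possible_values]
# groups -> [array([0, 2, 5]), array([1, 4]), array([], dtype=int64)]
# row 3 ends up in no group, which is exactly the "unaligned" case the method
# above detects with sum(len(g) for g in groups) < num_to_align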
def eval_rows(self, source_rows, expr_value, weights_value, context):
    fill = get_default_value(expr_value)
    result = np.full(context_length(context), fill, dtype=expr_value.dtype)

    id_sort_indices = np.argsort(source_rows)
    sorted_rownum = source_rows[id_sort_indices]
    sorted_values = expr_value[id_sort_indices]
    groups = groupby(zip(sorted_rownum, sorted_values), key=itemgetter(0))
    aggregate_func = self.aggregate_func
    for rownum, values in groups:
        if rownum == -1:
            continue
        # Note that v[n] is faster than using an itemgetter, even with map
        result[rownum] = aggregate_func(v[1] for v in values)
    return result
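# The aggregation above relies on the classic sort-then-groupby idiom: sort the
# (target rownum, value) pairs by rownum, then itertools.groupby yields one run
# per rownum. A minimal standalone sketch with a toy "sum over linked rows"
# (the data and names are illustrative, not the LIAM2 API):
from itertools import groupby
from operator import itemgetter

import numpy as np

source_rows = np.array([2, 0, 2, -1, 0])   # target row for each source row, -1 = no link
expr_value = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
result = np.zeros(3)                        # one slot per target row

order = np.argsort(source_rows)
pairs = zip(source_rows[order], expr_value[order])
for rownum, values in groupby(pairs, key=itemgetter(0)):
    if rownum == -1:
        continue                            # skip dangling links
    result[rownum] = sum(v[1] for v in values)
# result -> [7., 0., 4.]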
def _eval_args(self, context):
    args, kwargs = NumpyCreateArray._eval_args(self, context)
    if 'size' in self.argspec.args:
        pos = self.argspec.args.index('size')
        size = args[pos]
        # The original functions return a scalar when size is None, and an
        # array of length one when size is 1.
        # TODO: users should have a way to have the "size=None" behavior. We
        #       could differentiate whether None was explicitly passed or
        #       comes from the default value (as we did previously:
        #       'size' not in kwargs), but I do not think it is a good idea.
        #       Adding a new "sentinel" value (e.g. -1 or "scalar") is
        #       probably better.
        if size is None:
            args = args[:pos] + (context_length(context),) + args[pos + 1:]
    return args, kwargs
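# Context for the comment above: numpy's random functions return a scalar when
# size is None and a length-1 array when size=1, which is why the wrapper
# substitutes the context length when size is None. A quick standalone check
# (using numpy's Generator API purely for illustration):
import numpy as np

rng = np.random.default_rng(0)
print(np.ndim(rng.normal(size=None)))   # 0    -> a single scalar draw
print(rng.normal(size=1).shape)         # (1,) -> an array of length one
print(rng.normal(size=5).shape)         # (5,) -> one draw per individual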
def compute(self, context, filter=None, weights=None):
    if filter is not None:
        filter = np.asarray(filter)
        # TODO: check this at "compile" time (in __init__), though for that
        #       we need to know the type of all temporary variables first
        if not np.issubdtype(filter.dtype, np.bool_):
            raise ValueError("count filter must be a boolean expression")
    if weights is not None:
        weights = np.asarray(weights)
    if filter is None and weights is None:
        return context_length(context)
    elif weights is None:
        return np.sum(filter)
    elif filter is None:
        return np.sum(weights)
    else:
        return np.sum(weights * filter)
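# A quick illustration of the four branches above: count everything, count
# where a boolean filter holds, sum weights, or sum weights where the filter
# holds (True/False act as 1/0 in the product). Toy data only:
import numpy as np

filter_ = np.array([True, False, True, True])
weights = np.array([2.0, 1.0, 0.5, 1.5])
print(len(filter_))                 # 4    (no filter, no weights)
print(np.sum(filter_))              # 3    (filter only)
print(np.sum(weights))              # 5.0  (weights only)
print(np.sum(weights * filter_))    # 4.0  (both: 2.0 + 0.5 + 1.5)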
def eval_rows(self, source_rows, expr_value, weights_value, context):
    # We can't use a negative value because that is not allowed by
    # bincount, and using a value too high will uselessly increase the size
    # of the array returned by bincount
    idx_for_missing = context_length(context)

    missing_int = missing_values[int]

    # filter out missing values: those where the object pointed to does not
    # exist anymore (the id corresponds to -1 in id_to_rownum)
    # XXX: use np.putmask(source_rows, source_ids == missing_int,
    #                     missing_int)
    source_rows[source_rows == missing_int] = idx_for_missing

    # ideally this should be done with expr_eval in Aggregate.compute but
    # then it is no longer generic and I need to refactor the whole
    # internal API
    if weights_value is not None:
        expr_value = expr_value * weights_value

    counts = self.count(source_rows, expr_value, weights_value)
    # missing entries are filled with zeros, so it works nicely in this
    # case (sum/count)
    counts.resize(idx_for_missing)
    return counts
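# The trick above reserves one extra bucket (idx_for_missing == len(context))
# so that rows whose link is broken can be counted out of the way and then
# dropped at the end. A standalone numpy sketch of counting linked rows per
# target (toy data, not the LIAM2 helpers; slicing stands in for resize here):
import numpy as np

n_targets = 3
source_rows = np.array([2, 0, 2, -1, 0])      # -1 marks a dangling link
idx_for_missing = n_targets                    # harmless extra bucket

rows = source_rows.copy()
rows[rows == -1] = idx_for_missing
counts = np.bincount(rows, minlength=idx_for_missing + 1)
counts = counts[:idx_for_missing]              # drop the "missing" bucket
# counts -> [2, 0, 2]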
def fill_missing_values(ids, values, context, filler='auto'):
    """
    ids: ids present in past period
    values: values in past period
    context: current period context
    """
    if filler == 'auto':
        filler = get_default_value(values)
    result = np.full(context_length(context), filler, dtype=values.dtype)
    if len(ids):
        id_to_rownum = context.id_to_rownum
        # if there were more objects in the past than in the current
        # period. Currently, remove() keeps old ids, so this never
        # happens, but if we ever change remove(), we will need to add
        # such a check everywhere we use id_to_rownum
        # invalid_ids = ids > len(id_to_rownum)
        # if np.any(invalid_ids):
        #     fix ids
        rows = id_to_rownum[ids]
        safe_put(result, rows, values)
    return result
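# fill_missing_values scatters last period's values into an array shaped for
# the current period; rows with no past value keep the filler. A compact
# numpy-only sketch (toy data; id_to_rownum maps a global id to its current
# row, and a real safe_put would skip rows equal to -1):
import numpy as np

id_to_rownum = np.array([0, 1, -1, 2])         # id 2 no longer exists
past_ids = np.array([0, 1, 3])
past_values = np.array([10.0, 20.0, 30.0])

result = np.full(3, np.nan)                    # one slot per current row
rows = id_to_rownum[past_ids]                  # -> [0, 1, 2]
result[rows] = past_values
# result -> [10., 20., 30.]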
def align_link(self, context, score, need, filter, take, leave,
               expressions, possible_values, errors, frac_need, link,
               secondary_axis, method):
    target_context = link._target_context(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values,
                        target_context)

    # handle secondary axis
    if isinstance(secondary_axis, Expr):
        axis_name = str(secondary_axis)
        try:
            secondary_axis = need.axes.names.index(axis_name)
        except ValueError:
            raise ValueError("invalid value for secondary_axis: there is "
                             "no axis named '%s' in the need array"
                             % axis_name)
    elif isinstance(secondary_axis, int):
        if secondary_axis >= need.ndim:
            raise Exception("%d is an invalid value for secondary_axis: "
                            "it should be smaller than the number of "
                            "dimensions of the need array (%d)"
                            % (secondary_axis, need.ndim))
    else:
        assert secondary_axis is None

    # evaluate columns
    target_columns = [expr_eval(e, target_context) for e in expressions]
    # this is a one2many, so the link column is on the target side
    link_column = target_context[link._link_field]

    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        reverse_link = Many2One("reverse", link._link_field,
                                context.entity.name)
        target_filter = LinkGet(reverse_link, filter_expr, False)
        target_filter_value = expr_eval(target_filter, target_context)

        # It is often not a good idea to pre-filter columns like this
        # because we lose information about "indices", but in this case,
        # it is fine, because we do not need that information afterwards.
        filtered_columns = [col[target_filter_value]
                            if isinstance(col, np.ndarray) and col.shape
                            else [col]
                            for col in target_columns]

        link_column = link_column[target_filter_value]
    else:
        filtered_columns = target_columns
        target_filter_value = None

    # compute labels for filtered columns
    # -----------------------------------
    # We can't use _group_labels_light because group_labels assigns labels
    # on a first come, first served basis, not using the order they are
    # in pvalues
    fcols_labels = []
    filtered_length = len(filtered_columns[0])
    unaligned = np.zeros(filtered_length, dtype=bool)

    # XXX: this probably needs to use "possible_values" instead
    for fcol, pvalues in zip(filtered_columns, need.axes.labels):
        pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
        fcol_labels = np.empty(filtered_length, dtype=np.int32)
        for i in range(filtered_length):
            value_idx = pvalues_index.get(fcol[i], -1)
            if value_idx == -1:
                unaligned[i] = True
            fcol_labels[i] = value_idx
        fcols_labels.append(fcol_labels)

    num_unaligned = np.sum(unaligned)
    if num_unaligned:
        # further filter label columns and link_column
        validlabels = ~unaligned
        fcols_labels = [labels[validlabels] for labels in fcols_labels]
        link_column = link_column[validlabels]

        # display who are the evil ones
        ids = target_context['id']
        if target_filter_value is not None:
            filtered_ids = ids[target_filter_value]
        else:
            filtered_ids = ids
        self._display_unaligned(expressions, filtered_ids,
                                filtered_columns, unaligned)
    else:
        del unaligned

    id_to_rownum = context.id_to_rownum
    missing_int = missing_values[int]
    source_ids = link_column

    if len(id_to_rownum):
        source_rows = id_to_rownum[source_ids]
        # filter out missing values: those where the value of the link
        # points to nowhere (-1)
        source_rows[source_ids == missing_int] = missing_int
    else:
        assert np.all(source_ids == missing_int)
        source_rows = []

    # filtered_columns are not filtered further on invalid labels
    # (num_unaligned) but this is not a problem since those will be
    # ignored by GroupBy anyway.
    # TODO: the result of this is ugly because a groupby on *values*
    # returns an LArray with those values (ndarrays) as axes *names*. Ugh.
    groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

    # FIXME: target_context is not correct, as it is not filtered while
    # filtered_columns are. Since we do not use the context "columns" it
    # mostly works but I had to disable an assertion in utils.expand
    # because the length of the context is not correct.
    num_candidates = expr_eval(groupby_expr, target_context)

    # fetch the list of linked individuals for each local individual,
    # e.g. the list of person ids for each household
    hh = np.empty(context_length(context), dtype=object)
    # we can't use .fill([]) because it reuses the same list for all
    # objects
    for i in range(len(hh)):
        hh[i] = []

    # Even though this is highly sub-optimal, the time taken to create
    # those lists of ids is very small compared to the total time taken
    # for align_other (0.2s vs 4.26), so I shouldn't care too much about
    # it for now.

    # target_row (row of person) is an index valid for *filtered/label*
    # columns!
    for target_row, source_row in enumerate(source_rows):
        if source_row == -1:
            continue
        hh[source_row].append(target_row)

    class FakeContainer(object):
        def __init__(self, length):
            self.length = length

        def __len__(self):
            return self.length

    groups = [FakeContainer(g) for g in num_candidates]
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)
    aligned, error = \
        align_link_nd(score, need, num_candidates, hh, fcols_labels,
                      secondary_axis)
    self.past_error = error
    return aligned
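# Building `hh` above is an inverse-index step: for each source (e.g.
# household) row, collect the target (e.g. person) rows that link back to it.
# A toy version of that loop with hypothetical data (the real code gets the
# row numbers through id_to_rownum and the link column):
import numpy as np

n_households = 3
person_to_hh_row = np.array([0, 0, 2, -1, 2])   # -1: person linked to nothing

hh = np.empty(n_households, dtype=object)
for i in range(len(hh)):
    hh[i] = []                                   # .fill([]) would share one list
for person_row, hh_row in enumerate(person_to_hh_row):
    if hh_row == -1:
        continue
    hh[hh_row].append(person_row)
# hh -> [[0, 1], [], [2, 4]]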
def compute(self, context, entity_name=None, filter=None, number=None,
            **kwargs):
    if filter is not None and number is not None:
        # Having neither is allowed, though, as there can be a contextual
        # filter. Also, there is no reason to prevent the whole
        # population from giving birth, even though the usefulness of
        # such usage seems dubious.
        raise ValueError("new() 'filter' and 'number' arguments are "
                         "mutually exclusive")
    source_entity = context.entity
    if entity_name is None:
        target_entity = source_entity
    else:
        target_entity = context.entities[entity_name]

    # target context is the context where the new individuals will be
    # created
    if target_entity is source_entity:
        target_context = context
    else:
        # we do need to copy the data (.extra) because we will insert into
        # the entity.array anyway => fresh_data=True
        target_context = context.clone(fresh_data=True,
                                       entity_name=target_entity.name)

    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        to_give_birth = expr_eval(filter_expr, context)
        num_birth = to_give_birth.sum()
    elif number is not None:
        to_give_birth = None
        num_birth = number
    else:
        to_give_birth = np.ones(len(context), dtype=bool)
        num_birth = len(context)

    array = target_entity.array
    default_values = target_entity.fields.default_values

    id_to_rownum = target_entity.id_to_rownum
    num_individuals = len(id_to_rownum)

    children = self._initial_values(array, to_give_birth, num_birth,
                                    default_values)
    if num_birth:
        children['id'] = np.arange(num_individuals,
                                   num_individuals + num_birth)
        children['period'] = context.period

    used_variables = [v.name
                      for v in self._collect_kwargs_variables(kwargs)]
    if to_give_birth is None:
        assert not used_variables
        child_context = context.empty(num_birth)
    else:
        child_context = context.subset(to_give_birth, used_variables,
                                       filter_expr)
    for k, v in kwargs.items():
        if k not in array.dtype.names:
            print("WARNING: {} is unknown, ignoring it!".format(k))
            continue
        children[k] = expr_eval(v, child_context)

    add_individuals(target_context, children)

    expr_cache.invalidate(context.period, context.entity_name)

    # result is the ids of the new individuals corresponding to the source
    # entity
    if to_give_birth is not None:
        result = np.full(context_length(context), -1, dtype=int)
        if source_entity is target_entity:
            extra_bools = np.zeros(num_birth, dtype=bool)
            to_give_birth = np.concatenate((to_give_birth, extra_bools))
        # Note that np.place is a bit faster, but is currently buggy when
        # working with columns of structured arrays.
        # See https://github.com/numpy/numpy/issues/2462
        result[to_give_birth] = children['id']
        return result
    else:
        return None
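# When the new individuals are added to the *same* entity, the mask is padded
# with False entries so it matches the grown context before the children ids
# are scattered back to their "parents". A toy illustration of that last step
# (assuming, as the padding suggests, that the result array now also covers
# the freshly added rows; all data here is made up):
import numpy as np

to_give_birth = np.array([True, False, True])   # mask over the original rows
children_ids = np.array([3, 4])                 # ids of the two new rows
num_birth = len(children_ids)

result = np.full(len(to_give_birth) + num_birth, -1, dtype=int)
mask = np.concatenate((to_give_birth, np.zeros(num_birth, dtype=bool)))
result[mask] = children_ids                     # each parent gets its child's id
# result -> [3, -1, 4, -1, -1]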
def compute(self, context, *expressions, **kwargs):
    if not expressions:
        raise TypeError("groupby() takes at least 1 argument")

    # TODO: allow lists/tuples of arguments to group by the combinations
    # of keys
    for expr in expressions:
        if isinstance(expr, (bool, int, float)):
            raise TypeError("groupby() does not work with constant "
                            "arguments")
        if isinstance(expr, (tuple, list)):
            raise TypeError("groupby() takes expressions as arguments, "
                            "not a list of expressions")

    # On python 3, we could clean up this code (keyword only arguments).
    expr = kwargs.pop('expr', None)
    if expr is None:
        expr = Count()

    # by = kwargs.pop('by', None)
    filter_value = kwargs.pop('filter', None)
    percent = kwargs.pop('percent', False)
    possible_values = kwargs.pop('pvalues', None)
    totals = kwargs.pop('totals', True)

    expr_vars = [v.name for v in collect_variables(expr)]
    labels = [str(e) for e in expressions]
    columns = [expr_eval(e, context) for e in expressions]
    columns = [expand(c, context_length(context)) for c in columns]

    if filter_value is not None:
        filtered_columns = [col[filter_value] for col in columns]
        # FIXME: use the actual filter_expr instead of not_hashable
        filtered_context = context.subset(filter_value, expr_vars,
                                          not_hashable)
    else:
        filtered_columns = columns
        filtered_context = context

    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # We pre-filtered columns instead of passing the filter to partition_nd
    # because it is a bit faster this way. The indices are still correct,
    # because we use them on a filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        # return la.LArray([], labels, possible_values)
        return la.LArray([])

    # evaluate the expression on each group
    # we use not_hashable to avoid storing the subset in the cache
    contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in groups]
    data = [expr_eval(expr, c) for c in contexts]

    # TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of lists.
    # the first variable is the outer-most "loop",
    # the last one the inner-most.

    # add a total for each row
    len_pvalues = [len(vals) for vals in possible_values]

    if percent:
        totals = True

    if totals:
        width = len_pvalues[-1]
        height = prod(len_pvalues[:-1])
        rows_indices = [np.concatenate([groups[y * width + x]
                                        for x in range(width)])
                        for y in range(height)]
        cols_indices = [np.concatenate([groups[y * width + x]
                                        for y in range(height)])
                        for x in range(width)]
        cols_indices.append(np.concatenate(cols_indices))
        # evaluate the expression on each "combined" group (ie compute
        # totals)
        row_ctxs = [filtered_context.subset(indices, expr_vars,
                                            not_hashable)
                    for indices in rows_indices]
        row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
        col_ctxs = [filtered_context.subset(indices, expr_vars,
                                            not_hashable)
                    for indices in cols_indices]
        col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
    else:
        row_totals = None
        col_totals = None

    if percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behaviour of raising an exception.
        # This can happen at least when using the default expr (count())
        # and the filter yields empty groups
        total_value = np.float64(col_totals[-1])
        data = [100.0 * value / total_value for value in data]
        row_totals = [100.0 * value / total_value
                      for value in row_totals]
        col_totals = [100.0 * value / total_value
                      for value in col_totals]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in zip(data, divisors)]

    # convert to a 1d array. We don't simply use data = np.array(data),
    # because if data is a list of ndarrays (for example if we use
    # groupby(a, expr=id)), *and* all the ndarrays have the same length,
    # the result is a 2d array instead of an array of ndarrays like we
    # need (at this point).
    arr = np.empty(len(data), dtype=type(data[0]))
    arr[:] = data
    data = arr

    # and reshape it
    data = data.reshape(len_pvalues)
    axes = [la.Axis(axis_labels, axis_name)
            for axis_name, axis_labels in zip(labels, possible_values)]
    # FIXME: also handle totals
    return la.LArray(data, axes)
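# The final reshape works because `groups` (and therefore `data`) is laid out
# as the cartesian product of the possible values, with the last expression
# varying fastest. A numpy-only sketch of that layout for two grouping columns
# (toy data; the default expression is count(), so each cell is a group size):
import numpy as np

sex = np.array([0, 0, 1, 1, 0])
work = np.array([True, False, True, True, True])
pvalues = [[0, 1], [False, True]]

groups = [np.nonzero((sex == s) & (work == w))[0]
          for s in pvalues[0] for w in pvalues[1]]     # flat, row-major order
data = np.array([len(g) for g in groups])
table = data.reshape([len(v) for v in pvalues])
# table -> [[1, 2],
#           [0, 2]]   (rows: sex, columns: work)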
def compute(self, context, set1filter, set2filter, score, orderby,
            pool_size=None, algo='onebyone'):
    global matching_ctx

    if pool_size is not None:
        assert isinstance(pool_size, int)
        assert pool_size > 0

    set1filterexpr = self._getfilter(context, set1filter)
    set1filtervalue = expr_eval(set1filterexpr, context)
    set2filterexpr = self._getfilter(context, set2filter)
    set2filtervalue = expr_eval(set2filterexpr, context)
    set1len = set1filtervalue.sum()
    set2len = set2filtervalue.sum()
    print("matching with %d/%d individuals" % (set1len, set2len), end='')

    varnames = {v.name for v in score.collect_variables()}
    used_variables1 = {n for n in varnames
                       if not n.startswith('__other_')}
    used_variables2 = {n[8:] for n in varnames
                       if n.startswith('__other_')}
    if isinstance(orderby, str):
        assert orderby == 'EDtM'
        orderby_vars = used_variables1
    else:
        orderby_vars = {v.name for v in orderby.collect_variables()}

    if algo == 'onebyone':
        all_vars = {'id'} | used_variables1 | orderby_vars
        set1 = context.subset(set1filtervalue, all_vars, set1filterexpr)
        set2 = context.subset(set2filtervalue, {'id'} | used_variables2,
                              set2filterexpr)

        # subset creates a dict for the current entity, so .entity_data
        # is a dict
        set1 = set1.entity_data
        set2 = set2.entity_data

        set1['__ids__'] = set1['id'].reshape(set1len, 1)
        set2['__ids__'] = set2['id'].reshape(set2len, 1)

        print()
    else:
        # optimized matching by grouping sets by values, which usually
        # means smaller sets and improved running time.
        assert algo == 'byvalue'

        # if orderby contains variables that are not used in the score
        # expression, this will effectively add variables in the
        # matching context AND group by those variables. This is correct
        # because otherwise (if we did not group by them), we could have
        # groups containing individuals with different values of the
        # ordering variables (ie the ordering would not be respected).
        set1 = group_context(used_variables1 | orderby_vars,
                             set1filtervalue, context)
        set2 = group_context(used_variables2, set2filtervalue, context)

        # we cannot simply take the [:min(set1len, set2len)] indices like
        # in the non-optimized case and iterate over that because we do
        # not know how many groups we will need to match.
        print(" (%d/%d groups)"
              % (context_length(set1), context_length(set2)))

    if isinstance(orderby, str):
        orderbyvalue = np.zeros(context_length(set1))
        for name in used_variables1:
            column = set1[name]
            orderbyvalue += (column - column.mean()) ** 2 / column.var()
    else:
        orderbyvalue = expr_eval(orderby,
                                 context.clone(entity_data=set1))

    # Delete variables which are not in the score expression (but in the
    # orderby expr or possibly "id") because they are no longer needed and
    # would slow things down.
    context_keep(set1, used_variables1)
    context_keep(set2, used_variables2)

    sorted_set1_indices = orderbyvalue.argsort()[::-1]

    result = np.full(context_length(context), -1, dtype=int)
    id_to_rownum = context.id_to_rownum

    # prefix all keys except __len__
    matching_ctx = {'__other_' + k if k != '__len__' else k: v
                    for k, v in set2.items()}

    def match_cell(idx, sorted_idx, pool_size):
        global matching_ctx

        set2_size = context_length(matching_ctx)
        if not set2_size:
            raise StopIteration

        if pool_size is not None and set2_size > pool_size:
            pool = random.sample(range(set2_size), pool_size)
            local_ctx = context_subset(matching_ctx, pool)
        else:
            local_ctx = matching_ctx.copy()

        local_ctx.update((k, set1[k][sorted_idx])
                         for k in {'__ids__'} | used_variables1)

        eval_ctx = context.clone(entity_data=local_ctx)
        set2_scores = expr_eval(score, eval_ctx)
        cell2_idx = set2_scores.argmax()

        cell1ids = local_ctx['__ids__']
        cell2ids = local_ctx['__other___ids__'][cell2_idx]

        if pool_size is not None and set2_size > pool_size:
            # transform pool-local index to set/matching_ctx index
            cell2_idx = pool[cell2_idx]

        cell1size = len(cell1ids)
        cell2size = len(cell2ids)
        nb_match = min(cell1size, cell2size)

        # we could introduce a random choice here but it is not really
        # necessary. In that case, it should be done in group_context
        ids1 = cell1ids[:nb_match]
        ids2 = cell2ids[:nb_match]

        result[id_to_rownum[ids1]] = ids2
        result[id_to_rownum[ids2]] = ids1

        if nb_match == cell2size:
            matching_ctx = context_delete(matching_ctx, cell2_idx)
        else:
            # other variables do not need to be modified since the cell
            # only got smaller and was not deleted
            matching_ctx['__other___ids__'][cell2_idx] = \
                cell2ids[nb_match:]

        # FIXME: the expr gets cached for the full matching_ctx at the
        # beginning and then when another woman with the same values is
        # found, it thinks it can reuse the expr but it breaks because it
        # does not have the correct length.
        # The current workaround is to invalidate the whole cache for the
        # current entity but this is not the right way to go.
        # * disable the cache for matching?
        # * use a local cache so that methods after matching() can use
        #   what was in the cache before matching(). Shouldn't the cache
        #   be stored inside the context anyway?
        expr_cache.invalidate(context.period, context.entity_name)

        if nb_match < cell1size:
            set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
            match_cell(idx, sorted_idx, pool_size)

    loop_wh_progress(match_cell, sorted_set1_indices, pool_size)

    return result
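# The inner match_cell above is a greedy loop: for the current set 1 cell,
# score every remaining set 2 cell, take the argmax, pair as many individuals
# as both cells allow, then shrink (or delete) the chosen cell. A stripped-down
# numpy sketch of one such step, with a hypothetical score function and toy
# data (none of these names are from the real API):
import numpy as np

set1_value = 30.0                                    # e.g. age of the current group
set2_values = np.array([22.0, 31.0, 40.0])           # one entry per remaining cell
set2_ids = [[7, 8], [9], [10, 11, 12]]               # ids grouped per cell

scores = -np.abs(set2_values - set1_value)           # higher is better
best = int(scores.argmax())                          # -> 1 (value 31.0)
cell1_ids = np.array([3, 4])
nb_match = min(len(cell1_ids), len(set2_ids[best]))  # -> 1
matched = list(zip(cell1_ids[:nb_match], set2_ids[best][:nb_match]))
# matched -> [(3, 9)]; cell 1 still has id 4 left, so the real code recurses
# on the same sorted_idx with the remaining ids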