def evaluate(self, context):
    values = expr_eval(self.expr, context)
    values = np.asarray(values)

    filter_expr = self._getfilter(context)
    if filter_expr is not None:
        filter_values = expr_eval(filter_expr, context)
    else:
        filter_values = True
    if self.skip_na:
        # we should *not* use an inplace operation because filter_values
        # can be a simple variable
        filter_values = filter_values & ispresent(values)
    if filter_values is not True:
        values = values[filter_values]

    # from Wikipedia:
    # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
    #                              i=1..n                 i=1..n
    # but sum((n + 1 - i) * a[i])
    #        i=1..n
    #   = sum((n - i) * a[i] for i in range(n))
    #   = sum(cumsum(a))
    sorted_values = np.sort(values)
    n = len(values)

    # force float to avoid overflows with integer input expressions
    cumsum = np.cumsum(sorted_values, dtype=float)
    values_sum = cumsum[-1]
    if values_sum == 0:
        print("gini(%s, filter=%s): expression is all zeros (or nan) "
              "for filter" % (self.expr, filter_expr))
    return (n + 1 - 2 * np.sum(cumsum) / values_sum) / n
def run(self, context):
    plt.figure()
    args = [expr_eval(arg, context) for arg in self.args]
    kwargs = dict((k, expr_eval(v, context))
                  for k, v in self.kwargs.iteritems())
    self._draw(*args, **kwargs)
    plt.show()
def evaluate(self, context):
    # from Wikipedia:
    # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
    #                              i=1..n                 i=1..n
    # but sum((n + 1 - i) * a[i])
    #        i=1..n
    #   = sum((n - i) * a[i] for i in range(n))
    #   = sum(cumsum(a))
    values = expr_eval(self.expr, context)
    if isinstance(values, (list, tuple)):
        values = np.array(values)

    filter_expr = self._getfilter(context)
    if filter_expr is not None:
        filter_values = expr_eval(filter_expr, context)
    else:
        filter_values = True
    filter_values &= ispresent(values)
    values = values[filter_values]

    sorted_values = np.sort(values)
    n = len(values)

    # force float to avoid overflows with integer input expressions
    cumsum = np.cumsum(sorted_values, dtype=float)
    values_sum = cumsum[-1]
    return (n + 1 - 2 * np.sum(cumsum) / values_sum) / n
def eval_assertion(self, context):
    v1 = expr_eval(self.expr1, context)
    v2 = expr_eval(self.expr2, context)
    if not self.compare(v1, v2):
        op = self.inv_op
        return "%s %s %s (%s %s %s)" % (self.expr1, op, self.expr2,
                                        v1, op, v2)
def compute(self, context, set1filter, set2filter, orderby1, orderby2):
    set1filterexpr = self._getfilter(context, set1filter)
    set1filtervalue = expr_eval(set1filterexpr, context)
    set2filterexpr = self._getfilter(context, set2filter)
    set2filtervalue = expr_eval(set2filterexpr, context)

    set1len = set1filtervalue.sum()
    set2len = set2filtervalue.sum()
    numtomatch = min(set1len, set2len)
    print("matching with %d/%d individuals" % (set1len, set2len))

    result = np.full(context_length(context), -1, dtype=int)
    if not numtomatch:
        return result

    sorted_set1_indices = orderby1[set1filtervalue].argsort()[-numtomatch:]
    sorted_set2_indices = orderby2[set2filtervalue].argsort()[-numtomatch:]

    set1ids = context['id'][set1filtervalue]
    set2ids = context['id'][set2filtervalue]

    id_to_rownum = context.id_to_rownum
    id1 = set1ids[sorted_set1_indices]
    id2 = set2ids[sorted_set2_indices]
    # cannot use sorted_setX_indices because those are "local" indices
    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
    return result
def evaluate(self, context):
    if config.debug:
        print()
        print("random sequence position before:",
              np.random.get_state()[2])

    num = context_length(context)
    choices = self.choices
    if num:
        bins = self.bins
        if bins is None:
            # all values have the same probability
            choices_idx = np.random.randint(len(choices), size=num)
        else:
            if any(isinstance(b, Expr) for b in bins):
                weights = [expr_eval(expr, context) for expr in bins]
                bins = self._weights_to_bins(weights)
            u = np.random.uniform(size=num)
            # XXX: np.choice uses searchsorted(bins, u) instead of digitize
            choices_idx = np.digitize(u, bins) - 1
    else:
        choices_idx = []

    if config.debug:
        print("random sequence position after:",
              np.random.get_state()[2])

    if any(isinstance(c, Expr) for c in choices):
        choices = np.array([expr_eval(expr, context) for expr in choices])
    return choices[choices_idx]
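# --- Standalone sketch (not part of the framework above): illustrates the
# weights-to-bins + digitize sampling technique used by the choice
# evaluation. The helper name weights_to_bins is an assumption mirroring
# _weights_to_bins; only numpy is required.
import numpy as np

def weights_to_bins(weights):
    # cumulative probabilities prefixed with 0.0 give the bin edges:
    # a uniform draw u falls in bin i iff bins[i] <= u < bins[i + 1]
    bins = np.concatenate(([0.0], np.cumsum(weights, dtype=float)))
    # normalise in case the weights do not sum exactly to 1
    return bins / bins[-1]

if __name__ == '__main__':
    choices = np.array([10, 20, 30])
    weights = [0.2, 0.5, 0.3]
    bins = weights_to_bins(weights)
    u = np.random.uniform(size=100000)
    # digitize returns 1-based bin indices, hence the - 1
    choices_idx = np.digitize(u, bins) - 1
    sampled = choices[choices_idx]
    # empirical frequencies should be close to the weights
    print([np.mean(sampled == c) for c in choices])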
def eval_assertion(self, context, exception, expr):
    try:
        expr_eval(expr, context)
        return "did not raise"
    except eval(exception):
        return False
    except Exception as e:
        return "raised another exception (%s)" % e
def eval_assertion(self, context):
    r1 = expr_eval(self.expr1, context)
    r2 = expr_eval(self.expr2, context)
    if isinstance(r1, np.ndarray) and isinstance(r2, np.ndarray):
        passed = np.array_equal(r1, r2)
    else:
        passed = r1 == r2
    if not passed:
        return "%s != %s (%s != %s)" % (r1, r2, self.expr1, self.expr2)
def evaluate(self, context):
    if self.filter is not None:
        filter_value = expr_eval(self.filter, context)
    else:
        filter_value = None

    if self.expressions:
        expressions = list(self.expressions)
    else:
        # extra=False because we don't want globals nor "system" variables
        # (nan, period, __xxx__)
        expressions = [Variable(name)
                       for name in context.keys(extra=False)]

    str_expressions = [str(e) for e in expressions]
    if 'id' not in str_expressions:
        str_expressions.insert(0, 'id')
        expressions.insert(0, Variable('id'))
        id_pos = 0
    else:
        id_pos = str_expressions.index('id')

    # if (self.periods is not None and len(self.periods) and
    #         'period' not in str_expressions):
    #     str_expressions.insert(0, 'period')
    #     expressions.insert(0, Variable('period'))
    #     id_pos += 1

    columns = []
    for expr in expressions:
        expr_value = expr_eval(expr, context)
        if (filter_value is not None and
                isinstance(expr_value, np.ndarray) and expr_value.shape):
            expr_value = expr_value[filter_value]
        columns.append(expr_value)

    ids = columns[id_pos]
    if isinstance(ids, np.ndarray) and ids.shape:
        numrows = len(ids)
    else:
        numrows = 1

    # expand scalar columns to full columns in memory
    for idx, col in enumerate(columns):
        dtype = None
        if not isinstance(col, np.ndarray):
            dtype = type(col)
        elif not col.shape:
            dtype = col.dtype.type
        if dtype is not None:
            newcol = np.empty(numrows, dtype=dtype)
            newcol.fill(col)
            columns[idx] = newcol

    data = izip(*columns)
    table = chain([str_expressions], data) if self.header else data
    return PrettyTable(table, self.missing)
def evaluate(self, context):
    args = [expr_eval(arg, context) for arg in self.args]
    kwargs = dict((k, expr_eval(v, context))
                  for k, v in self.kwargs.iteritems())
    if 'size' in self.arg_names and 'size' not in kwargs:
        kwargs['size'] = context_length(context)
    if self.filter_expr is None:
        filter_value = None
    else:
        filter_value = expr_eval(self.filter_expr, context)
    func = self.np_func[0]
    return self.compute(func, args, kwargs, filter_value)
def value_for_period(self, expr, period, context, fill='auto'):
    sub_context = EntityContext(self, {'period': period})
    result = expr_eval(expr, sub_context)

    if isinstance(result, np.ndarray) and result.shape:
        ids = expr_eval(Variable('id'), sub_context)
        if fill is None:
            return ids, result
        else:
            # expand values to the current "outer" context
            return self.fill_missing_values(ids, result, context, fill)
    else:
        return result
def eval_assertion(self, context):
    v1 = expr_eval(self.expr1, context)
    v2 = expr_eval(self.expr2, context)
    result = self.compare(v1, v2)
    if isinstance(result, tuple):
        result, details = result
    else:
        details = ''
    if not result:
        op = self.inv_op
        return "%s %s %s (%s %s %s)%s" % (self.expr1, op, self.expr2,
                                          v1, op, v2, details)
def _eval_need(self, context, scores, filter_value):
    expressions = self.expressions
    possible_values = self.possible_values
    if isinstance(self.need, (tuple, list)):
        need = np.array([expr_eval(e, context) for e in self.need])
    elif isinstance(self.need, Expr):
        need = expr_eval(self.need, context)
        # need was a *scalar* expr
        if not (isinstance(need, np.ndarray) and need.shape):
            need = np.array([need])
    else:
        need = self.need

    if self.need[0] is None and self.method == "sidewalk":
        # Note: need is calculated over score and we could consider
        # calculating it without leave_filter and without take_filter
        if filter_value is not None:
            scores = scores[filter_value]
        need = int(sum(scores))
        need = np.array([need])

    if isinstance(need, LabeledArray):
        if not expressions:
            expressions = [Variable(expressions_context.entity, name)
                           for name in need.dim_names]
        if not possible_values:
            possible_values = need.pvalues

    assert isinstance(need, np.ndarray)

    if len(expressions) != len(possible_values):
        raise Exception("align() expressions and possible_values "
                        "have different length: %d vs %d"
                        % (len(expressions), len(possible_values)))

    if 'period' in [str(e) for e in expressions]:
        period = context.period
        expressions, possible_values, need = \
            kill_axis('period', period, expressions, possible_values,
                      need, abs(self.periodicity_given))

    # kill any axis where the value is constant for all individuals
    # satisfying the filter
    # tokill = [(expr, column[0])
    #           for expr, column in zip(expressions, columns)
    #           if isconstant(column, filter_value)]
    # for expr, value in tokill:
    #     expressions, possible_values, need = \
    #         kill_axis(str(expr), value, expressions, possible_values,
    #                   need)

    return need, expressions, possible_values
def align_no_link(self, context, score, need, filter, take, leave,
                  expressions, possible_values, errors, frac_need, link,
                  secondary_axis, method):
    ctx_length = context_length(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values)

    filter_value = expr_eval(self._getfilter(context, filter), context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    # retrieve the columns we need to work with
    if expressions:
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)
    # FIXME: either handle past_error in no link (currently, the past
    # error is added... but never computed, so always 0 !) or raise
    # an error in case errors='carry' is used with no link.
    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                score, take, leave, method)
def align_no_link(self, context):
    ctx_length = context_length(context)

    scores = expr_eval(self.expr, context)

    need, expressions, possible_values = self._eval_need(context)

    filter_value = expr_eval(self._getfilter(context), context)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    if expressions:
        # retrieve the columns we need to work with
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need)
    need = self._add_past_error(need, context)

    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter)
def evaluate(self, context):
    expr = self.expr
    filter_expr = self._getfilter(context)
    if filter_expr is not None:
        expr *= filter_expr
    return np.nansum(expr_eval(expr, context))
def compute(self, context, bool_expr):
    entity = context.entity
    baseperiod = entity.base_period
    period = context.period - 1
    value = expr_eval(bool_expr, context)

    # using a full int so that the "store" type check works
    result = value.astype(np.int)
    res_size = len(entity.array)
    last_period_true = np.full(res_size, period + 1, dtype=np.int)

    id_to_rownum = context.id_to_rownum
    still_running = value.copy()
    while np.any(still_running) and period >= baseperiod:
        ids, values = self.value_for_period(bool_expr, period, context,
                                            fill=None)
        missing = np.ones(res_size, dtype=bool)
        period_value = np.zeros(res_size, dtype=bool)
        if len(ids):
            value_rows = id_to_rownum[ids]
            safe_put(missing, value_rows, False)
            safe_put(period_value, value_rows, values)
        value = still_running & period_value
        result += value * (last_period_true - period)
        still_running &= period_value | missing
        last_period_true[period_value] = period
        period -= 1
    return result
def compute(self, context, expr, filter=None, skip_na=True):
    # FIXME: either take "contextual filter" into account here (by using
    # self._getfilter), or don't do it in sum & gini
    if filter is not None:
        tmpvar = self.add_tmp_var(context, filter)
        if getdtype(expr, context) is bool:
            # convert expr to int because mul_bbb is not implemented in
            # numexpr
            # expr *= 1
            expr = BinaryOp('*', expr, 1)
        # expr *= filter_values
        expr = BinaryOp('*', expr, tmpvar)
    else:
        filter = True

    values = expr_eval(expr, context)
    values = np.asarray(values)

    if skip_na:
        # we should *not* use an inplace operation because filter can be a
        # simple variable
        filter = filter & ispresent(values)
    if filter is True:
        numrows = len(values)
    else:
        numrows = np.sum(filter)

    if numrows:
        if skip_na:
            return na_sum(values) / float(numrows)
        else:
            return np.sum(values) / float(numrows)
    else:
        return float('nan')
def compute(self, context, expr, filter=None, skip_na=True):
    values = np.asarray(expr)

    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        filter_values = expr_eval(filter_expr, context)
    else:
        filter_values = True
    if skip_na:
        # we should *not* use an inplace operation because filter_values
        # can be a simple variable
        filter_values = filter_values & ispresent(values)
    if filter_values is not True:
        values = values[filter_values]

    # from Wikipedia:
    # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
    #                              i=1..n                 i=1..n
    # but sum((n + 1 - i) * a[i])
    #        i=1..n
    #   = sum((n - i) * a[i] for i in range(n))
    #   = sum(cumsum(a))
    sorted_values = np.sort(values)
    n = len(values)

    # force float to avoid overflows with integer input expressions
    cumsum = np.cumsum(sorted_values, dtype=float)
    values_sum = cumsum[-1]
    if values_sum == 0:
        print("gini(%s, filter=%s): expression is all zeros (or nan) "
              "for filter" % (self.args[0], filter))
    return (n + 1 - 2 * np.sum(cumsum) / values_sum) / n
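# --- Standalone sketch (not part of the framework above): checks the
# cumsum-based Gini formula used in the comments against the direct
# pairwise definition G = sum_ij |a_i - a_j| / (2 * n * sum_i a_i).
# Only numpy is required; the function names are illustrative.
import numpy as np

def gini_cumsum(values):
    a = np.sort(np.asarray(values, dtype=float))
    n = len(a)
    cumsum = np.cumsum(a)
    return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n

def gini_pairwise(values):
    a = np.asarray(values, dtype=float)
    n = len(a)
    return np.abs(a[:, None] - a[None, :]).sum() / (2 * n * a.sum())

if __name__ == '__main__':
    values = np.random.randint(1, 1000, size=500)
    # both formulas should agree up to floating point error
    assert np.isclose(gini_cumsum(values), gini_pairwise(values))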
def evaluate(self, context):
    entity = context['__entity__']
    baseperiod = entity.base_period
    period = context['period'] - 1
    bool_expr = self.expr
    value = expr_eval(bool_expr, context)

    # using a full int so that the "store" type check works
    result = value.astype(np.int)
    res_size = len(entity.array)
    last_period_true = np.empty(res_size, dtype=np.int)
    last_period_true.fill(period + 1)

    id_to_rownum = context.id_to_rownum
    still_running = value.copy()
    while np.any(still_running) and period >= baseperiod:
        ids, values = entity.value_for_period(bool_expr, period, context,
                                              fill=None)
        missing = np.ones(res_size, dtype=bool)
        period_value = np.zeros(res_size, dtype=bool)
        if len(ids):
            value_rows = id_to_rownum[ids]
            safe_put(missing, value_rows, False)
            safe_put(period_value, value_rows, values)
        value = still_running & period_value
        result += value * (last_period_true - period)
        still_running &= period_value | missing
        last_period_true[period_value] = period
        period -= 1
    return result
def value_for_period(self, expr, period, context, fill='auto'):
    sub_context = EntityContext(self,
                                {'periods': [period],
                                 'period_idx': 0,
                                 'format_date': context['format_date'],
                                 '__globals__': context['__globals__']})
    result = expr_eval(expr, sub_context)

    if isinstance(result, np.ndarray) and result.shape:
        ids = expr_eval(Variable('id'), sub_context)
        if fill is None:
            return ids, result
        else:
            # expand values to the current "outer" context
            return self.fill_missing_values(ids, result, context, fill)
    else:
        return result
def run_guarded(self, context):
    while expr_eval(self.cond, context):
        self.code.run_guarded(context)
        # FIXME: this is a bit brutal :) This is necessary because
        # otherwise test_while loops indefinitely (because "values" is
        # never incremented)
        expr_cache.clear()
def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
    global local_ctx

    set2_size = context_length(local_ctx)
    if not set2_size:
        raise StopIteration

    if set2_size > pool_size:
        pool = random.sample(xrange(context_length(local_ctx)), pool_size)
    else:
        pool = range(set2_size)

    sub_local_ctx = context_subset(local_ctx, pool, None)
    sub_local_ctx.update((k, set1[k][sorted_idx])
                         for k in ['id'] + used_variables1)
    set2_scores = expr_eval(score_expr, sub_local_ctx)

    individual2_pool_idx = np.argmax(set2_scores)
    individual2_idx = pool[individual2_pool_idx]

    id1 = sub_local_ctx['id']
    id2 = local_ctx['__other_id'][individual2_idx]

    local_ctx = context_delete(local_ctx, individual2_idx)

    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
def match_one_set1_individual(idx, sorted_idx):
    global local_ctx

    if not context_length(local_ctx):
        raise StopIteration

    local_ctx.update((k, set1[k][sorted_idx])
                     for k in ['id'] + used_variables1)

    # pk = tuple(individual1[fname] for fname in pk_names)
    # optimized_expr = optimized_exprs.get(pk)
    # if optimized_expr is None:
    #     for name in pk_names:
    #         fake_set1['__f_%s' % name].value = individual1[name]
    #     optimized_expr = str(symbolic_expr.simplify())
    #     optimized_exprs[pk] = optimized_expr
    # set2_scores = evaluate(optimized_expr, mm_dict, set2)

    set2_scores = expr_eval(score_expr, local_ctx)

    individual2_idx = np.argmax(set2_scores)

    id1 = local_ctx['id']
    id2 = local_ctx['__other_id'][individual2_idx]

    local_ctx = context_delete(local_ctx, individual2_idx)

    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
def compute(self, context, link, target_expr, missing_value=None):
    """
    link must be a Link instance
    target_expr can be any expression (it will be evaluated on the
    target rows)
    """
    assert isinstance(link, Link)
    assert isinstance(target_expr, Expr), str(type(target_expr))

    # noinspection PyProtectedMember
    target_ids = context[link._link_field]
    target_context = self.target_context(context)

    id_to_rownum = target_context.id_to_rownum

    missing_int = missing_values[int]
    target_rows = id_to_rownum[target_ids]

    target_values = expr_eval(target_expr, target_context)
    if missing_value is None:
        missing_value = get_missing_value(target_values)

    result_values = target_values[target_rows]

    # it is a bit faster with numexpr (mixed_links: 0.22s -> 0.17s)
    return ne.evaluate("where((ids != mi) & (rows != mi), values, mv)",
                       {'ids': target_ids, 'rows': target_rows,
                        'values': result_values, 'mi': missing_int,
                        'mv': missing_value})
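# --- Standalone sketch (not part of the framework above): the core of the
# many2one "link get" above is plain fancy indexing plus a where() fallback
# for broken links. The names (id_to_rownum, MISSING) are illustrative and
# plain numpy is used instead of numexpr.
import numpy as np

MISSING = -1

# target entity: maps id -> row number (-1 when the id no longer exists)
target_ids_col = np.array([0, 1, 3])           # ids present in the target
id_to_rownum = np.full(4, MISSING)
id_to_rownum[target_ids_col] = np.arange(len(target_ids_col))

target_values = np.array([10.0, 20.0, 30.0])   # one value per target row

# source entity: the link column holds the id of the linked target row
link_ids = np.array([3, MISSING, 2, 0])        # id 2 points to a dead row
rows = id_to_rownum[link_ids]
values = target_values[rows]                   # garbage where rows == -1

result = np.where((link_ids != MISSING) & (rows != MISSING),
                  values, np.nan)
print(result)   # [30. nan nan 10.]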
def run(self, context):
    value = expr_eval(self.expr, context)
    # Assignment to a field with a name == None is valid: it simply means
    # the result must not be stored. This happens when a user does not
    # store anywhere the result of an expression (it usually has side
    # effects -- csv, new, remove, ...).
    if self.name is not None:
        self.store_result(value, context)
def compute(self, context, link, target_expr, target_filter=None):
    # assert isinstance(context, EntityContext), \
    #     "one2many aggregates in groupby are currently not supported"
    assert isinstance(link, One2Many), "%s (%s)" % (link, type(link))

    # eg (in household entity):
    # persons: {type: one2many, target: person, field: hh_id}
    target_context = link._target_context(context)

    # this is a one2many, so the link column is on the target side
    # noinspection PyProtectedMember
    source_ids = target_context[link._link_field]
    expr_value = expr_eval(target_expr, target_context)
    filter_value = expr_eval(target_filter, target_context)
    if filter_value is not None:
        source_ids = source_ids[filter_value]
        # intentionally not using np.isscalar because of some corner
        # cases, eg. None and np.array(1.0)
        if isinstance(expr_value, np.ndarray) and expr_value.shape:
            expr_value = expr_value[filter_value]

    missing_int = missing_values[int]

    id_to_rownum = context.id_to_rownum
    if len(id_to_rownum):
        try:
            source_rows = id_to_rownum[source_ids]
        except:
            import pdb
            pdb.set_trace()
        # filter out missing values: those where the value of the link
        # points to nowhere (-1)
        # XXX: use np.putmask(source_rows, source_ids == missing_int,
        #                     missing_int)
        source_rows[source_ids == missing_int] = missing_int
    else:
        assert np.all(source_ids == missing_int)
        # we need to make a copy because eval_rows modifies the array
        # in place in some cases (countlink and descendants)
        # TODO: document this fact in eval_rows
        source_rows = source_ids.copy()

    if isinstance(expr_value, np.ndarray) and expr_value.shape:
        assert len(source_rows) == len(expr_value), \
            "%d != %d" % (len(source_rows), len(expr_value))

    return self.eval_rows(source_rows, expr_value, context)
def run_guarded(self, simulation, const_dict):
    while True:
        context = EntityContext(self.entity, const_dict.copy())
        cond_value = expr_eval(self.cond, context)
        if not cond_value:
            break

        self.code.run_guarded(simulation, const_dict)
def evaluate(self, context):
    link = self.get_link(context)
    target_ids = expr_eval(Variable(link._link_field), context)
    target_context = self.target_context(context)

    id_to_rownum = target_context.id_to_rownum
    missing_int = missing_values[int]
    target_rows = id_to_rownum[target_ids]

    target_values = expr_eval(self.target_expression, target_context)
    missing_value = self.missing_value
    if missing_value is None:
        missing_value = get_missing_value(target_values)

    valid_link = (target_ids != missing_int) & (target_rows != missing_int)
    return np.where(valid_link, target_values[target_rows], missing_value)
def run(self, context):
    entity = context['__entity__']
    period = context['period']
    fname = self.fname.format(entity=entity.name, period=period)
    print "writing to", fname, "...",
    file_path = os.path.join(config.output_directory, fname)

    with open(file_path, self.mode + 'b') as f:
        dataWriter = csv.writer(f)
        for arg in self.args:
            if isinstance(arg, TableExpression):
                data = expr_eval(arg, context)
            elif isinstance(arg, (list, tuple)):
                data = [[expr_eval(expr, context) for expr in arg]]
            else:
                data = [[expr_eval(arg, context)]]
            dataWriter.writerows(data)
def compute(self, context, expr, filter=None, skip_na=True):
    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        expr = BinaryOp('*', expr, filter_expr)
    values = expr_eval(expr, context)
    values = np.asarray(values)
    return na_sum(values) if skip_na else np.sum(values)
def create_cost(idx, sorted_idx):
    global cost

    if not context_length(local_ctx):
        raise StopIteration

    local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
    set2_scores = expr_eval(score_expr, local_ctx)
    cost.append(set2_scores[:].tolist())
def value_for_period(expr, period, context, fill='auto'):
    sub_context = context.clone(fresh_data=True, period=period)
    result = expr_eval(expr, sub_context)

    if isinstance(result, np.ndarray) and result.shape:
        ids = sub_context['id']
        if fill is None:
            return ids, result
        else:
            # expand values to the current "outer" context
            return TimeFunction.fill_missing_values(ids, result, context,
                                                    fill)
    else:
        return result
def run_guarded(self, context, *args, **kwargs):
    # XXX: wouldn't some form of cascading context make all this junk much
    # cleaner? Context(globalvars, localvars) (globalvars contain both
    # entity fields and global temporaries)

    backup = self.backup_and_purge_locals()

    if len(args) != len(self.argnames):
        raise TypeError("%s() takes exactly %d arguments (%d given)"
                        % (self.name, len(self.argnames), len(args)))

    for name in self.argnames:
        if name in self.entity.fields.names:
            raise ValueError("function '%s' cannot have an argument named "
                             "'%s' because there is a field with the "
                             "same name" % (self.name, name))

    # contextual filter should not transfer to the called function (even
    # if that would somewhat make sense) because in many cases the
    # columns used in the contextual filter are not available within the
    # called function. This is only relevant for functions called within
    # an if() expression.
    context = context.clone(filter_expr=None)

    # add arguments to the local namespace
    for name, value in zip(self.argnames, args):
        # backup the variable if it existed in the caller namespace
        if name in self.entity.temp_variables:
            # we can safely assign to backup without checking if that name
            # was already assigned because it is not possible for a
            # variable to be both in entity.temp_variables and in backup
            # (they are removed from entity.temp_variables).
            backup[name] = self.entity.temp_variables.pop(name)

        # cannot use context[name] = value because that would store the
        # value in .extra, which is wiped at the start of each process
        # and we need it to be available across all processes of the
        # function
        self.entity.temp_variables[name] = value

    try:
        self.code.run_guarded(context)
        result = expr_eval(self.result, context)
    except ReturnException as r:
        result = r.result

    self.purge_and_restore_locals(backup)
    return result
def execute(self, s):
    entity = self.entity
    if entity is None:
        raise Exception(entity_required)

    period = self.period
    if period is None:
        raise Exception(period_required)

    entity_name = self.entity.name
    parse_ctx = self.parse_ctx.copy()
    local_parse_ctx = parse_ctx[entity_name].copy()

    # add all currently defined temp_variables because otherwise
    # local variables (defined within a function) wouldn't be available
    local_parse_ctx.update((name, Variable(entity, name))
                           for name in entity.temp_variables.keys())
    parse_ctx[entity_name] = local_parse_ctx
    expr = parse(s, parse_ctx, interactive=True)
    result = expr_eval(expr, self.eval_ctx)
    if result is None:
        print("done.")
    return result
def compute(self, context, entity_name=None, filter=None, number=None,
            **kwargs):
    if filter is not None and number is not None:
        # Having neither is allowed, though, as there can be a contextual
        # filter. Also, there is no reason to prevent the whole
        # population giving birth, even though the usefulness of such
        # usage seems dubious.
        raise ValueError("new() 'filter' and 'number' arguments are "
                         "mutually exclusive")

    source_entity = context.entity
    if entity_name is None:
        target_entity = source_entity
    else:
        target_entity = context.entities[entity_name]

    # target context is the context where the new individuals will be
    # created
    if target_entity is source_entity:
        target_context = context
    else:
        # we do need to copy the data (.extra) because we will insert into
        # the entity.array anyway => fresh_data=True
        target_context = context.clone(fresh_data=True,
                                       entity_name=target_entity.name)

    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        to_give_birth = expr_eval(filter_expr, context)
        num_birth = to_give_birth.sum()
    elif number is not None:
        to_give_birth = None
        num_birth = number
    else:
        to_give_birth = np.ones(len(context), dtype=bool)
        num_birth = len(context)

    array = target_entity.array
    default_values = target_entity.fields.default_values

    id_to_rownum = target_entity.id_to_rownum
    num_individuals = len(id_to_rownum)

    children = self._initial_values(array, to_give_birth, num_birth,
                                    default_values)
    if num_birth:
        children['id'] = np.arange(num_individuals,
                                   num_individuals + num_birth)
        children['period'] = context.period

        used_variables = [v.name
                          for v in self._collect_kwargs_variables(kwargs)]
        if to_give_birth is None:
            assert not used_variables
            child_context = context.empty(num_birth)
        else:
            child_context = context.subset(to_give_birth, used_variables,
                                           filter_expr)
        for k, v in kwargs.iteritems():
            if k not in array.dtype.names:
                print("WARNING: {} is unknown, ignoring it!".format(k))
                continue
            children[k] = expr_eval(v, child_context)

    add_individuals(target_context, children)

    expr_cache.invalidate(context.period, context.entity_name)

    # result is the ids of the new individuals corresponding to the source
    # entity
    if to_give_birth is not None:
        result = np.full(context_length(context), -1, dtype=int)
        if source_entity is target_entity:
            extra_bools = np.zeros(num_birth, dtype=bool)
            to_give_birth = np.concatenate((to_give_birth, extra_bools))
        # Note that np.place is a bit faster, but is currently buggy when
        # working with columns of structured arrays.
        # See https://github.com/numpy/numpy/issues/2462
        result[to_give_birth] = children['id']
        return result
    else:
        return None
def match_cell(idx, sorted_idx, pool_size):
    global matching_ctx

    set2_size = context_length(matching_ctx)
    if not set2_size:
        raise StopIteration

    if pool_size is not None and set2_size > pool_size:
        pool = random.sample(xrange(set2_size), pool_size)
        local_ctx = context_subset(matching_ctx, pool)
    else:
        local_ctx = matching_ctx.copy()

    local_ctx.update((k, set1[k][sorted_idx])
                     for k in {'__ids__'} | used_variables1)

    eval_ctx = context.clone(entity_data=local_ctx)
    set2_scores = expr_eval(score, eval_ctx)
    cell2_idx = set2_scores.argmax()

    cell1ids = local_ctx['__ids__']
    cell2ids = local_ctx['__other___ids__'][cell2_idx]

    if pool_size is not None and set2_size > pool_size:
        # transform pool-local index to set/matching_ctx index
        cell2_idx = pool[cell2_idx]

    cell1size = len(cell1ids)
    cell2size = len(cell2ids)
    nb_match = min(cell1size, cell2size)

    # we could introduce a random choice here but it is not really
    # necessary. In that case, it should be done in group_context
    ids1 = cell1ids[:nb_match]
    ids2 = cell2ids[:nb_match]

    result[id_to_rownum[ids1]] = ids2
    result[id_to_rownum[ids2]] = ids1

    if nb_match == cell2size:
        matching_ctx = context_delete(matching_ctx, cell2_idx)
    else:
        # other variables do not need to be modified since the cell
        # only got smaller and was not deleted
        matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

    # FIXME: the expr gets cached for the full matching_ctx at the
    # beginning and then when another cell with the same values is
    # found, it thinks it can reuse the cached expr but it breaks because
    # it does not have the correct length.
    # The current workaround is to invalidate the whole cache for the
    # current entity but this is not the right way to go.
    # * disable the cache for matching?
    # * use a local cache so that methods after matching() can use
    #   what was in the cache before matching(). Shouldn't the cache be
    #   stored inside the context anyway?
    expr_cache.invalidate(context.period, context.entity_name)

    if nb_match < cell1size:
        set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
        match_cell(idx, sorted_idx, pool_size)
def run_guarded(self, context):
    raise ReturnException(expr_eval(self.result_expr, context))
def compute(self, context, *args, **kwargs):
    filter_value = kwargs.pop('filter', None)
    missing = kwargs.pop('missing', None)
    # periods = kwargs.pop('periods', None)
    header = kwargs.pop('header', True)
    limit = kwargs.pop('limit', None)
    entity = context.entity

    if args:
        expressions = list(args)
    else:
        # extra=False because we don't want globals nor "system" variables
        # (nan, period, __xxx__)
        # FIXME: we should also somehow "traverse" expressions in this
        # case too (args is ()) => all keys in the current context
        expressions = [Variable(entity, name)
                       for name in context.keys(extra=False)]

    str_expressions = [str(e) for e in expressions]
    if 'id' not in str_expressions:
        str_expressions.insert(0, 'id')
        expressions.insert(0, Variable(entity, 'id'))
        id_pos = 0
    else:
        id_pos = str_expressions.index('id')

    # if (self.periods is not None and len(self.periods) and
    #         'period' not in str_expressions):
    #     str_expressions.insert(0, 'period')
    #     expressions.insert(0, Variable('period'))
    #     id_pos += 1

    columns = []
    for expr in expressions:
        if filter_value is False:
            # dtype does not matter much
            expr_value = np.empty(0)
        else:
            expr_value = expr_eval(expr, context)
            if (filter_value is not None and
                    isinstance(expr_value, np.ndarray) and
                    expr_value.shape):
                expr_value = expr_value[filter_value]
        columns.append(expr_value)

    ids = columns[id_pos]
    if isinstance(ids, np.ndarray) and ids.shape:
        numrows = len(ids)
    else:
        # FIXME: we need a test for this case (no idea how this can happen)
        numrows = 1

    # expand scalar columns to full columns in memory
    # TODO: handle or explicitly reject columns with ndim > 1
    for idx, col in enumerate(columns):
        dtype = None
        if not isinstance(col, np.ndarray):
            dtype = type(col)
        elif not col.shape:
            dtype = col.dtype.type
        if dtype is not None:
            # TODO: try using itertools.repeat instead as it seems to be a
            # bit faster and would consume less memory (however, it might
            # not play very well with Pandas.to_csv)
            newcol = np.full(numrows, col, dtype=dtype)
            columns[idx] = newcol

    if limit is not None:
        assert isinstance(limit, (int, long))
        columns = [col[:limit] for col in columns]

    data = izip(*columns)
    table = chain([str_expressions], data) if header else data
    return PrettyTable(table, missing)
def align_link(self, context, score, need, filter, take, leave,
               expressions, possible_values, errors, frac_need, link,
               secondary_axis, method):
    target_context = link._target_context(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values,
                        target_context)

    # handle secondary axis
    if isinstance(secondary_axis, Expr):
        axis_name = str(secondary_axis)
        try:
            secondary_axis = need.dim_names.index(axis_name)
        except ValueError:
            raise ValueError("invalid value for secondary_axis: there is "
                             "no axis named '%s' in the need array"
                             % axis_name)
    else:
        if secondary_axis >= need.ndim:
            raise Exception("%d is an invalid value for secondary_axis: "
                            "it should be smaller than the number of "
                            "dimension of the need array (%d)"
                            % (secondary_axis, need.ndim))

    # evaluate columns
    target_columns = [expr_eval(e, target_context) for e in expressions]
    # this is a one2many, so the link column is on the target side
    link_column = target_context[link._link_field]

    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        reverse_link = Many2One("reverse", link._link_field,
                                context.entity.name)
        target_filter = LinkGet(reverse_link, filter_expr, False)
        target_filter_value = expr_eval(target_filter, target_context)

        # It is often not a good idea to pre-filter columns like this
        # because we lose information about "indices", but in this case,
        # it is fine, because we do not need that information afterwards.
        filtered_columns = [col[target_filter_value]
                            if isinstance(col, np.ndarray) and col.shape
                            else [col]
                            for col in target_columns]

        link_column = link_column[target_filter_value]
    else:
        filtered_columns = target_columns
        target_filter_value = None

    # compute labels for filtered columns
    # -----------------------------------
    # We can't use _group_labels_light because group_labels assigns labels
    # on a first come, first served basis, not using the order they are
    # in pvalues
    fcols_labels = []
    filtered_length = len(filtered_columns[0])
    unaligned = np.zeros(filtered_length, dtype=bool)
    for fcol, pvalues in zip(filtered_columns, need.pvalues):
        pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
        fcol_labels = np.empty(filtered_length, dtype=np.int32)
        for i in range(filtered_length):
            value_idx = pvalues_index.get(fcol[i], -1)
            if value_idx == -1:
                unaligned[i] = True
            fcol_labels[i] = value_idx
        fcols_labels.append(fcol_labels)

    num_unaligned = np.sum(unaligned)
    if num_unaligned:
        # further filter label columns and link_column
        validlabels = ~unaligned
        fcols_labels = [labels[validlabels] for labels in fcols_labels]
        link_column = link_column[validlabels]

        # display who are the evil ones
        ids = target_context['id']
        if target_filter_value is not None:
            filtered_ids = ids[target_filter_value]
        else:
            filtered_ids = ids
        self._display_unaligned(expressions, filtered_ids,
                                filtered_columns, unaligned)
    else:
        del unaligned

    id_to_rownum = context.id_to_rownum
    missing_int = missing_values[int]
    source_ids = link_column

    if len(id_to_rownum):
        source_rows = id_to_rownum[source_ids]
        # filter out missing values: those where the value of the link
        # points to nowhere (-1)
        source_rows[source_ids == missing_int] = missing_int
    else:
        assert np.all(source_ids == missing_int)
        source_rows = []

    # filtered_columns are not filtered further on invalid labels
    # (num_unaligned) but this is not a problem since those will be
    # ignored by GroupBy anyway.
    # TODO: this is ugly because a groupby on "values" returns an LArray
    # with those values (ndarrays) as axes names. Ugh.
    groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

    # FIXME: target_context is not correct, as it is not filtered while
    # filtered_columns are. Since we do not use the context "columns" it
    # mostly works but I had to disable an assertion in utils.expand
    # because the length of the context is not correct.
    num_candidates = expr_eval(groupby_expr, target_context)

    # fetch the list of linked individuals for each local individual.
    # e.g. the list of person ids for each household
    hh = np.empty(context_length(context), dtype=object)
    # we can't use .fill([]) because it reuses the same list for all
    # objects
    for i in range(len(hh)):
        hh[i] = []

    # Even though this is highly sub-optimal, the time taken to create
    # those lists of ids is very small compared to the total time taken
    # for align_other (0.2s vs 4.26), so I shouldn't care too much about
    # it for now.

    # target_row (row of person) is an index valid for *filtered/label*
    # columns !
    for target_row, source_row in enumerate(source_rows):
        if source_row == -1:
            continue
        hh[source_row].append(target_row)

    class FakeContainer(object):
        def __init__(self, length):
            self.length = length

        def __len__(self):
            return self.length

    groups = [FakeContainer(g) for g in num_candidates]
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)

    aligned, error = \
        align_link_nd(score, need, num_candidates, hh, fcols_labels,
                      secondary_axis)
    self.past_error = error
    return aligned
def compute(self, context, set1filter, set2filter, score, orderby,
            pool_size=None, algo='onebyone'):
    global matching_ctx

    if pool_size is not None:
        assert isinstance(pool_size, int)
        assert pool_size > 0

    set1filterexpr = self._getfilter(context, set1filter)
    set1filtervalue = expr_eval(set1filterexpr, context)
    set2filterexpr = self._getfilter(context, set2filter)
    set2filtervalue = expr_eval(set2filterexpr, context)
    set1len = set1filtervalue.sum()
    set2len = set2filtervalue.sum()
    print("matching with %d/%d individuals" % (set1len, set2len), end='')

    varnames = {v.name for v in score.collect_variables()}
    used_variables1 = {n for n in varnames if not n.startswith('__other_')}
    used_variables2 = {n[8:] for n in varnames if n.startswith('__other_')}
    if isinstance(orderby, str):
        assert orderby == 'EDtM'
        orderby_vars = used_variables1
    else:
        orderby_vars = {v.name for v in orderby.collect_variables()}

    if algo == 'onebyone':
        all_vars = {'id'} | used_variables1 | orderby_vars
        set1 = context.subset(set1filtervalue, all_vars, set1filterexpr)
        set2 = context.subset(set2filtervalue, {'id'} | used_variables2,
                              set2filterexpr)

        # subset creates a dict for the current entity, so .entity_data is
        # a dict
        set1 = set1.entity_data
        set2 = set2.entity_data

        set1['__ids__'] = set1['id'].reshape(set1len, 1)
        set2['__ids__'] = set2['id'].reshape(set2len, 1)

        print()
    else:
        # optimized matching by grouping sets by values, which usually
        # means smaller sets and improved running time.
        assert algo == 'byvalue'

        # if orderby contains variables that are not used in the score
        # expression, this will effectively add variables in the
        # matching context AND group by those variables. This is correct
        # because otherwise (if we did not group by them), we could have
        # groups containing individuals with different values of the
        # ordering variables (ie the ordering would not be respected).
        set1 = group_context(used_variables1 | orderby_vars,
                             set1filtervalue, context)
        set2 = group_context(used_variables2, set2filtervalue, context)

        # we cannot simply take the [:min(set1len, set2len)] indices like
        # in the non-optimized case and iterate over that because we don't
        # know how many groups we will need to match.
        print(" (%d/%d groups)"
              % (context_length(set1), context_length(set2)))

    if isinstance(orderby, str):
        orderbyvalue = np.zeros(context_length(set1))
        for name in used_variables1:
            column = set1[name]
            orderbyvalue += (column - column.mean()) ** 2 / column.var()
    else:
        orderbyvalue = expr_eval(orderby, context.clone(entity_data=set1))

    # Delete variables which are not in the score expression (but in the
    # orderby expr or possibly "id") because they are no longer needed and
    # would slow things down.
    context_keep(set1, used_variables1)
    context_keep(set2, used_variables2)

    sorted_set1_indices = orderbyvalue.argsort()[::-1]

    result = np.full(context_length(context), -1, dtype=int)
    id_to_rownum = context.id_to_rownum

    # prefix all keys except __len__
    matching_ctx = {'__other_' + k if k != '__len__' else k: v
                    for k, v in set2.iteritems()}

    def match_cell(idx, sorted_idx, pool_size):
        global matching_ctx

        set2_size = context_length(matching_ctx)
        if not set2_size:
            raise StopIteration

        if pool_size is not None and set2_size > pool_size:
            pool = random.sample(xrange(set2_size), pool_size)
            local_ctx = context_subset(matching_ctx, pool)
        else:
            local_ctx = matching_ctx.copy()

        local_ctx.update((k, set1[k][sorted_idx])
                         for k in {'__ids__'} | used_variables1)

        eval_ctx = context.clone(entity_data=local_ctx)
        set2_scores = expr_eval(score, eval_ctx)
        cell2_idx = set2_scores.argmax()

        cell1ids = local_ctx['__ids__']
        cell2ids = local_ctx['__other___ids__'][cell2_idx]

        if pool_size is not None and set2_size > pool_size:
            # transform pool-local index to set/matching_ctx index
            cell2_idx = pool[cell2_idx]

        cell1size = len(cell1ids)
        cell2size = len(cell2ids)
        nb_match = min(cell1size, cell2size)

        # we could introduce a random choice here but it is not really
        # necessary. In that case, it should be done in group_context
        ids1 = cell1ids[:nb_match]
        ids2 = cell2ids[:nb_match]

        result[id_to_rownum[ids1]] = ids2
        result[id_to_rownum[ids2]] = ids1

        if nb_match == cell2size:
            matching_ctx = context_delete(matching_ctx, cell2_idx)
        else:
            # other variables do not need to be modified since the cell
            # only got smaller and was not deleted
            matching_ctx['__other___ids__'][cell2_idx] = \
                cell2ids[nb_match:]

        # FIXME: the expr gets cached for the full matching_ctx at the
        # beginning and then when another cell with the same values is
        # found, it thinks it can reuse the cached expr but it breaks
        # because it does not have the correct length.
        # The current workaround is to invalidate the whole cache for the
        # current entity but this is not the right way to go.
        # * disable the cache for matching?
        # * use a local cache so that methods after matching() can use
        #   what was in the cache before matching(). Shouldn't the cache
        #   be stored inside the context anyway?
        expr_cache.invalidate(context.period, context.entity_name)

        if nb_match < cell1size:
            set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
            match_cell(idx, sorted_idx, pool_size)

    loop_wh_progress(match_cell, sorted_set1_indices, pool_size)
    return result
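# --- Standalone sketch (not part of the framework above): when orderby is
# the string 'EDtM', individuals are ordered by their squared Euclidean
# distance to the mean of the matching variables, with each variable
# standardised by its variance (as in the loop above). Names are
# illustrative; only numpy is required.
import numpy as np

def edtm(columns):
    # columns: dict of name -> 1d array, one entry per matching variable
    order = np.zeros(len(next(iter(columns.values()))))
    for column in columns.values():
        order += (column - column.mean()) ** 2 / column.var()
    return order

if __name__ == '__main__':
    cols = {'age': np.array([20.0, 40.0, 60.0]),
            'income': np.array([1000.0, 1500.0, 5000.0])}
    score = edtm(cols)
    # the most "atypical" individuals (largest distance) are matched first
    print(score.argsort()[::-1])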
def compute(self, context, *expressions, **kwargs):
    if not expressions:
        raise TypeError("groupby() takes at least 1 argument")

    # TODO: allow lists/tuples of arguments to group by the combinations
    # of keys
    for expr in expressions:
        if isinstance(expr, (bool, int, float)):
            raise TypeError("groupby() does not work with constant "
                            "arguments")
        if isinstance(expr, (tuple, list)):
            raise TypeError("groupby() takes expressions as arguments, "
                            "not a list of expressions")

    # On python 3, we could clean up this code (keyword only arguments).
    expr = kwargs.pop('expr', None)
    if expr is None:
        expr = Count()

    # by = kwargs.pop('by', None)
    filter_value = kwargs.pop('filter', None)
    percent = kwargs.pop('percent', False)
    possible_values = kwargs.pop('pvalues', None)

    expr_vars = [v.name for v in collect_variables(expr)]
    labels = [str(e) for e in expressions]
    columns = [expr_eval(e, context) for e in expressions]
    columns = [expand(c, context_length(context)) for c in columns]

    if filter_value is not None:
        filtered_columns = [col[filter_value] for col in columns]
        # FIXME: use the actual filter_expr instead of not_hashable
        filtered_context = context.subset(filter_value, expr_vars,
                                          not_hashable)
    else:
        filtered_columns = columns
        filtered_context = context

    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # We pre-filtered columns instead of passing the filter to
    # partition_nd because it is a bit faster this way. The indices are
    # still correct, because we use them on a filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        return LabeledArray([], labels, possible_values)

    # evaluate the expression on each group
    # we use not_hashable to avoid storing the subset in the cache
    contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in groups]
    data = [expr_eval(expr, c) for c in contexts]

    # TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of lists.
    # the first variable is the outer-most "loop",
    # the last one the inner most.

    # add total for each row
    len_pvalues = [len(vals) for vals in possible_values]
    width = len_pvalues[-1]
    height = prod(len_pvalues[:-1])

    rows_indices = [np.concatenate([groups[y * width + x]
                                    for x in range(width)])
                    for y in range(height)]
    cols_indices = [np.concatenate([groups[y * width + x]
                                    for y in range(height)])
                    for x in range(width)]
    cols_indices.append(np.concatenate(cols_indices))

    # evaluate the expression on each "combined" group (ie compute totals)
    row_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in rows_indices]
    row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
    col_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in cols_indices]
    col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]

    if percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behaviour of raising an exception.
        # This can happen at least when using the default expr (count())
        # and the filter yields empty groups
        total_value = np.float64(col_totals[-1])
        data = [100.0 * value / total_value for value in data]
        row_totals = [100.0 * value / total_value for value in row_totals]
        col_totals = [100.0 * value / total_value for value in col_totals]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in izip(data, divisors)]

    # convert to a 1d array. We don't simply use data = np.array(data),
    # because if data is a list of ndarrays (for example if we use
    # groupby(a, expr=id)) *and* all the ndarrays have the same length,
    # the result is a 2d array instead of an array of ndarrays like we
    # need (at this point).
    arr = np.empty(len(data), dtype=type(data[0]))
    arr[:] = data
    data = arr

    # and reshape it
    data = data.reshape(len_pvalues)
    return LabeledArray(data, labels, possible_values,
                        row_totals, col_totals)
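# --- Standalone sketch (not part of the framework above): illustrates how
# the flat "groups" list above maps to a 2d table and how row/column totals
# are built by concatenating group indices. Names and data are illustrative.
import numpy as np

sex = np.array([0, 0, 1, 1, 0, 1])
work = np.array([0, 1, 0, 1, 1, 1])

# flat list of index groups: the first variable (sex) is the outer loop,
# the last one (work) the inner loop
pvalues = [[0, 1], [0, 1]]
groups = [np.where((sex == s) & (work == w))[0]
          for s in pvalues[0] for w in pvalues[1]]

width = len(pvalues[-1])              # number of work categories
height = len(groups) // width         # number of sex categories

counts = np.array([len(g) for g in groups]).reshape(height, width)
row_totals = [sum(len(groups[y * width + x]) for x in range(width))
              for y in range(height)]
col_totals = [sum(len(groups[y * width + x]) for y in range(height))
              for x in range(width)]
print(counts)       # [[1 2] [1 2]]
print(row_totals)   # [3, 3]
print(col_totals)   # [2, 4]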