Example #1
    def evaluate(self, context):
        values = expr_eval(self.expr, context)
        values = np.asarray(values)

        filter_expr = self._getfilter(context)
        if filter_expr is not None:
            filter_values = expr_eval(filter_expr, context)
        else:
            filter_values = True
        if self.skip_na:
            # we should *not* use an inplace operation because filter_values
            # can be a simple variable
            filter_values = filter_values & ispresent(values)
        if filter_values is not True:
            values = values[filter_values]

        # from Wikipedia:
        # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
        #                        i=1..n                    i=1..n
        # but sum((n + 1 - i) * a[i])
        #    i=1..n
        #   = sum((n - i) * a[i] for i in range(n))
        #   = sum(cumsum(a))
        sorted_values = np.sort(values)
        n = len(values)

        # force float to avoid overflows with integer input expressions
        cumsum = np.cumsum(sorted_values, dtype=float)
        values_sum = cumsum[-1]
        if values_sum == 0:
            print("gini(%s, filter=%s): expression is all zeros (or nan) "
                  "for filter" % (self.expr, filter_expr))
        return (n + 1 - 2 * np.sum(cumsum) / values_sum) / n
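
A quick numerical check of the cumulative-sum shortcut used in the comment above; this is a standalone NumPy sketch, not part of the framework:

import numpy as np

def gini_direct(a):
    # textbook form: G = (n + 1 - 2 * sum((n + 1 - i) * a[i]) / sum(a)) / n
    # with a sorted ascending and i running from 1 to n
    a = np.sort(np.asarray(a, dtype=float))
    n = len(a)
    i = np.arange(1, n + 1)
    return (n + 1 - 2 * np.sum((n + 1 - i) * a) / a.sum()) / n

def gini_cumsum(a):
    # the shortcut: sum((n + 1 - i) * a[i], i=1..n) == sum(cumsum(sorted(a)))
    a = np.sort(np.asarray(a, dtype=float))
    n = len(a)
    cumsum = np.cumsum(a)
    return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n

a = np.random.rand(1000)
assert np.isclose(gini_direct(a), gini_cumsum(a))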
Example #2
 def run(self, context):
     plt.figure()
     args = [expr_eval(arg, context) for arg in self.args]
     kwargs = dict((k, expr_eval(v, context))
                   for k, v in self.kwargs.items())
     self._draw(*args, **kwargs)
     plt.show()
Example #3
    def evaluate(self, context):
        # from Wikipedia:
        # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
        #                        i=1..n                    i=1..n
        # but sum((n + 1 - i) * a[i])
        #    i=1..n
        #   = sum((n - i) * a[i] for i in range(n))
        #   = sum(cumsum(a))
        values = expr_eval(self.expr, context)
        if isinstance(values, (list, tuple)):
            values = np.array(values)

        filter_expr = self._getfilter(context)
        if filter_expr is not None:
            filter_values = expr_eval(filter_expr, context)
        else:
            filter_values = True
        # not an in-place operation because filter_values can be a plain
        # (shared) variable
        filter_values = filter_values & ispresent(values)
        values = values[filter_values]
        sorted_values = np.sort(values)
        n = len(values)

        # force float to avoid overflows with integer input expressions
        cumsum = np.cumsum(sorted_values, dtype=float)
        values_sum = cumsum[-1]
        return (n + 1 - 2 * np.sum(cumsum) / values_sum) / n
Example #4
 def eval_assertion(self, context):
     v1 = expr_eval(self.expr1, context)
     v2 = expr_eval(self.expr2, context)
     if not self.compare(v1, v2):
         op = self.inv_op
         return "%s %s %s (%s %s %s)" % (self.expr1, op, self.expr2,
                                         v1, op, v2)
Example #5
    def compute(self, context, set1filter, set2filter, orderby1, orderby2):
        set1filterexpr = self._getfilter(context, set1filter)
        set1filtervalue = expr_eval(set1filterexpr, context)
        set2filterexpr = self._getfilter(context, set2filter)
        set2filtervalue = expr_eval(set2filterexpr, context)
        set1len = set1filtervalue.sum()
        set2len = set2filtervalue.sum()
        numtomatch = min(set1len, set2len)
        print("matching with %d/%d individuals" % (set1len, set2len))
        result = np.full(context_length(context), -1, dtype=int)
        if not numtomatch:
            return result

        sorted_set1_indices = orderby1[set1filtervalue].argsort()[-numtomatch:]
        sorted_set2_indices = orderby2[set2filtervalue].argsort()[-numtomatch:]

        set1ids = context['id'][set1filtervalue]
        set2ids = context['id'][set2filtervalue]

        id_to_rownum = context.id_to_rownum
        id1 = set1ids[sorted_set1_indices]
        id2 = set2ids[sorted_set2_indices]
        # cannot use sorted_setX_indices because those are "local" indices
        result[id_to_rownum[id1]] = id2
        result[id_to_rownum[id2]] = id1
        return result
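
The pairing above relies on argsort: keeping the last numtomatch positions of each argsort selects the numtomatch highest-ranked individuals of each set, in ascending order of their orderby value, so the k-th kept index of set 1 gets paired with the k-th kept index of set 2. A standalone illustration with made-up data:

import numpy as np

orderby1 = np.array([0.2, 0.9, 0.1, 0.5])   # set 1 ranking values
orderby2 = np.array([0.7, 0.3, 0.8])        # set 2 ranking values

numtomatch = min(len(orderby1), len(orderby2))

# indices of the highest-ranked individuals, ascending by orderby value
top1 = orderby1.argsort()[-numtomatch:]     # [3, 1] -> values 0.5, 0.9
top2 = orderby2.argsort()[-numtomatch:]     # [0, 2] -> values 0.7, 0.8
print(list(zip(top1, top2)))                # [(3, 0), (1, 2)]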
Example #6
    def evaluate(self, context):
        if config.debug:
            print()
            print("random sequence position before:", np.random.get_state()[2])
        num = context_length(context)
        choices = self.choices
        if num:
            bins = self.bins
            if bins is None:
                # all values have the same probability
                choices_idx = np.random.randint(len(choices), size=num)
            else:
                if any(isinstance(b, Expr) for b in bins):
                    weights = [expr_eval(expr, context) for expr in bins]
                    bins = self._weights_to_bins(weights)
                u = np.random.uniform(size=num)
                # XXX: np.random.choice uses searchsorted(bins, u) instead of digitize
                choices_idx = np.digitize(u, bins) - 1
        else:
            choices_idx = []

        if config.debug:
            print("random sequence position after:", np.random.get_state()[2])

        if any(isinstance(c, Expr) for c in choices):
            choices = np.array([expr_eval(expr, context) for expr in choices])
        return choices[choices_idx]
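
The weighted branch draws uniform numbers and buckets them into bins, which are assumed here to be the cumulative probabilities of the choices (presumably what _weights_to_bins builds). A standalone sketch of the mechanism with hypothetical weights:

import numpy as np

choices = np.array([10, 20, 30])
probs = np.array([0.2, 0.5, 0.3])

# cumulative bin edges: [0.0, 0.2, 0.7, 1.0]
bins = np.concatenate(([0.0], np.cumsum(probs)))

u = np.random.uniform(size=100000)
# digitize returns the (1-based) index of the bin each u falls into,
# hence the "- 1" to get a valid index into choices
choices_idx = np.digitize(u, bins) - 1
sample = choices[choices_idx]
print(np.bincount(choices_idx) / len(u))    # roughly [0.2, 0.5, 0.3]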
Example #7
 def eval_assertion(self, context, exception, expr):
     try:
         expr_eval(expr)
         return "did not raise"
     except eval(exception):
         return False
     except Exception as e:
         return "raised another exception (%s)" % e
Example #9
 def eval_assertion(self, context):
     r1 = expr_eval(self.expr1, context)
     r2 = expr_eval(self.expr2, context)
     if isinstance(r1, np.ndarray) and isinstance(r2, np.ndarray):
         passed = np.array_equal(r1, r2)
     else:
         passed = r1 == r2
     if not passed:
         return "%s != %s (%s != %s)" % (r1, r2, self.expr1, self.expr2)
Example #10
    def evaluate(self, context):
        if self.filter is not None:
            filter_value = expr_eval(self.filter, context)
        else:
            filter_value = None

        if self.expressions:
            expressions = list(self.expressions)
        else:
            # extra=False because we don't want globals nor "system" variables
            # (nan, period, __xxx__)
            expressions = [Variable(name)
                           for name in context.keys(extra=False)]

        str_expressions = [str(e) for e in expressions]
        if 'id' not in str_expressions:
            str_expressions.insert(0, 'id')
            expressions.insert(0, Variable('id'))
            id_pos = 0
        else:
            id_pos = str_expressions.index('id')

#        if (self.periods is not None and len(self.periods) and
#            'period' not in str_expressions):
#            str_expressions.insert(0, 'period')
#            expressions.insert(0, Variable('period'))
#            id_pos += 1

        columns = []
        for expr in expressions:
            expr_value = expr_eval(expr, context)
            if (filter_value is not None and isinstance(expr_value, np.ndarray)
                and expr_value.shape):
                expr_value = expr_value[filter_value]
            columns.append(expr_value)

        ids = columns[id_pos]
        if isinstance(ids, np.ndarray) and ids.shape:
            numrows = len(ids)
        else:
            numrows = 1

        # expand scalar columns to full columns in memory
        for idx, col in enumerate(columns):
            dtype = None
            if not isinstance(col, np.ndarray):
                dtype = type(col)
            elif not col.shape:
                dtype = col.dtype.type
            if dtype is not None:
                newcol = np.empty(numrows, dtype=dtype)
                newcol.fill(col)
                columns[idx] = newcol

        data = zip(*columns)
        table = chain([str_expressions], data) if self.header else data
        return PrettyTable(table, self.missing)
Example #11
 def evaluate(self, context):
     args = [expr_eval(arg, context) for arg in self.args]
     kwargs = dict((k, expr_eval(v, context))
                   for k, v in self.kwargs.items())
     if 'size' in self.arg_names and 'size' not in kwargs:
         kwargs['size'] = context_length(context)
     if self.filter_expr is None:
         filter_value = None
     else:
         filter_value = expr_eval(self.filter_expr, context)
     func = self.np_func[0]
     return self.compute(func, args, kwargs, filter_value)
Example #12
 def value_for_period(self, expr, period, context, fill='auto'):
     sub_context = EntityContext(self, {'period': period})
     result = expr_eval(expr, sub_context)
     if isinstance(result, np.ndarray) and result.shape:
         ids = expr_eval(Variable('id'), sub_context)
         if fill is None:
             return ids, result
         else:
             # expand values to the current "outer" context
             return self.fill_missing_values(ids, result, context, fill)
     else:
         return result
Example #13
 def eval_assertion(self, context):
     v1 = expr_eval(self.expr1, context)
     v2 = expr_eval(self.expr2, context)
     result = self.compare(v1, v2)
     if isinstance(result, tuple):
         result, details = result
     else:
         details = ''
     if not result:
         op = self.inv_op
         return "%s %s %s (%s %s %s)%s" % (self.expr1, op, self.expr2,
                                           v1, op, v2, details)
Example #14
    def _eval_need(self, context, scores, filter_value):
        expressions = self.expressions
        possible_values = self.possible_values
        if isinstance(self.need, (tuple, list)):
            need = np.array([expr_eval(e, context) for e in self.need])
        elif isinstance(self.need, Expr):
            need = expr_eval(self.need, context)
            # need was a *scalar* expr
            if not (isinstance(need, np.ndarray) and need.shape):
                need = np.array([need])
        else:
            need = self.need
        
        if self.need[0] is None and self.method == "sidewalk":
            # note: need is computed over the scores; we could consider
            # computing it without leave_filter and without take_filter
            if filter_value is not None:
                scores = scores[filter_value]
            need = np.array([int(sum(scores))])

        if isinstance(need, LabeledArray):
            if not expressions:
                expressions = [Variable(expressions_context.entity, name)
                               for name in need.dim_names]
            if not possible_values:
                possible_values = need.pvalues

        assert isinstance(need, np.ndarray)

        if len(expressions) != len(possible_values):
            raise Exception("align() expressions and possible_values "
                            "have different length: %d vs %d"
                            % (len(expressions), len(possible_values)))

        if 'period' in [str(e) for e in expressions]:
            period = context.period
            expressions, possible_values, need = \
                kill_axis('period', period, expressions, possible_values,
                          need, abs(self.periodicity_given))

        # kill any axis where the value is constant for all individuals
        # satisfying the filter
#        tokill = [(expr, column[0])
#                  for expr, column in zip(expressions, columns)
#                  if isconstant(column, filter_value)]
#        for expr, value in tokill:
#            expressions, possible_values, need = \
#                kill_axis(str(expr), value, expressions, possible_values,
#                          need)

        return need, expressions, possible_values
Example #15
    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):

        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        # retrieve the columns we need to work with
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in no link (currently, the past
        #        error is added... but never computed, so always 0 !) or raise
        #        an error in case errors='carry' is used with no link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)
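
partition_nd is not shown here, but it essentially buckets row indices by combination of column values. A rough standalone sketch of that idea (not the actual implementation), assuming equal-length columns and an explicit list of possible values per column:

import numpy as np
from itertools import product

def partition(columns, possible_values):
    # one group of row indices per combination of possible values
    groups = []
    for combo in product(*possible_values):
        mask = np.ones(len(columns[0]), dtype=bool)
        for col, value in zip(columns, combo):
            mask &= (col == value)
        groups.append(np.flatnonzero(mask))
    return groups

age_group = np.array([0, 1, 1, 0, 2])
gender = np.array([0, 0, 1, 1, 0])
groups = partition([age_group, gender], [[0, 1, 2], [0, 1]])
# combos (0,0), (0,1), (1,0), (1,1), (2,0), (2,1)
print([g.tolist() for g in groups])   # [[0], [3], [1], [2], [4], []]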
Example #17
    def align_no_link(self, context):
        ctx_length = context_length(context)

        scores = expr_eval(self.expr, context)

        need, expressions, possible_values = self._eval_need(context)

        filter_value = expr_eval(self._getfilter(context), context)
        take_filter = expr_eval(self.take_filter, context)
        leave_filter = expr_eval(self.leave_filter, context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        if expressions:
            # retrieve the columns we need to work with
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        #noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need)
        need = self._add_past_error(need, context)

        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    scores, take_filter, leave_filter)
Example #18
    def evaluate(self, context):
        expr = self.expr
        filter_expr = self._getfilter(context)
        if filter_expr is not None:
            expr *= filter_expr

        return np.nansum(expr_eval(expr, context))
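
Multiplying the expression by its boolean filter before summing is equivalent to summing the filtered subset, because excluded rows contribute 0; np.nansum then also drops NaNs. A quick standalone check:

import numpy as np

values = np.array([1.0, 2.0, np.nan, 4.0])
filt = np.array([True, False, True, True])

# excluded rows become 0 and do not affect the sum
assert np.nansum(values * filt) == np.nansum(values[filt])   # both 5.0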
Example #19
    def compute(self, context, bool_expr):
        entity = context.entity

        baseperiod = entity.base_period
        period = context.period - 1
        value = expr_eval(bool_expr, context)

        # using a full int so that the "store" type check works
        result = value.astype(int)
        res_size = len(entity.array)
        last_period_true = np.full(res_size, period + 1, dtype=int)

        id_to_rownum = context.id_to_rownum
        still_running = value.copy()
        while np.any(still_running) and period >= baseperiod:
            ids, values = self.value_for_period(bool_expr,
                                                period,
                                                context,
                                                fill=None)
            missing = np.ones(res_size, dtype=bool)
            period_value = np.zeros(res_size, dtype=bool)
            if len(ids):
                value_rows = id_to_rownum[ids]
                safe_put(missing, value_rows, False)
                safe_put(period_value, value_rows, values)

            value = still_running & period_value
            result += value * (last_period_true - period)

            still_running &= period_value | missing
            last_period_true[period_value] = period
            period -= 1
        return result
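
Stripped of the id/missing-value bookkeeping, the backward scan computes, for each individual, for how many consecutive periods (up to and including the current one) the condition has been true. A simplified standalone sketch on a (periods x individuals) boolean history with made-up data:

import numpy as np

# rows = consecutive periods (oldest first), columns = individuals
history = np.array([[0, 1, 1],
                    [1, 1, 0],
                    [1, 1, 1]], dtype=bool)

# start from the current (last) period, then walk backwards and keep
# counting only while the condition stays true
duration = history[-1].astype(int)
still_running = history[-1].copy()
for row in history[-2::-1]:
    still_running &= row
    duration += still_running
print(duration)   # [2 3 1]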
Example #20
    def compute(self, context, expr, filter=None, skip_na=True):
        # FIXME: either take "contextual filter" into account here (by using
        # self._getfilter), or don't do it in sum & gini
        if filter is not None:
            tmpvar = self.add_tmp_var(context, filter)
            if getdtype(expr, context) is bool:
                # convert expr to int because mul_bbb is not implemented in
                # numexpr
                # expr *= 1
                expr = BinaryOp('*', expr, 1)
            # expr *= filter_values
            expr = BinaryOp('*', expr, tmpvar)
        else:
            filter = True

        values = expr_eval(expr, context)
        values = np.asarray(values)

        if skip_na:
            # we should *not* use an inplace operation because filter can be a
            # simple variable
            filter = filter & ispresent(values)

        if filter is True:
            numrows = len(values)
        else:
            numrows = np.sum(filter)

        if numrows:
            if skip_na:
                return na_sum(values) / float(numrows)
            else:
                return np.sum(values) / float(numrows)
        else:
            return float('nan')
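
With skip_na=True, the method boils down to a NaN-aware mean of the filtered subset: the sum of present values divided by the count of filtered, present rows. A standalone equivalence check:

import numpy as np

values = np.array([1.0, 2.0, np.nan, 4.0])
filt = np.array([True, True, True, False])

numrows = np.sum(filt & ~np.isnan(values))        # 2 present, filtered rows
result = np.nansum(values[filt]) / numrows        # 1.5
assert np.isclose(result, np.nanmean(values[filt]))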
Example #21
    def compute(self, context, expr, filter=None, skip_na=True):
        values = np.asarray(expr)

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            filter_values = expr_eval(filter_expr, context)
        else:
            filter_values = True
        if skip_na:
            # we should *not* use an inplace operation because filter_values
            # can be a simple variable
            filter_values = filter_values & ispresent(values)
        if filter_values is not True:
            values = values[filter_values]

        # from Wikipedia:
        # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
        #                        i=1..n                    i=1..n
        # but sum((n + 1 - i) * a[i])
        #    i=1..n
        #   = sum((n - i) * a[i] for i in range(n))
        #   = sum(cumsum(a))
        sorted_values = np.sort(values)
        n = len(values)

        # force float to avoid overflows with integer input expressions
        cumsum = np.cumsum(sorted_values, dtype=float)
        values_sum = cumsum[-1]
        if values_sum == 0:
            print("gini(%s, filter=%s): expression is all zeros (or nan) "
                  "for filter" % (self.args[0], filter))
        return (n + 1 - 2 * np.sum(cumsum) / values_sum) / n
Example #22
    def evaluate(self, context):
        entity = context['__entity__']

        baseperiod = entity.base_period
        period = context['period'] - 1
        bool_expr = self.expr
        value = expr_eval(bool_expr, context)

        # using a full int so that the "store" type check works
        result = value.astype(int)
        res_size = len(entity.array)
        last_period_true = np.empty(res_size, dtype=int)
        last_period_true.fill(period + 1)

        id_to_rownum = context.id_to_rownum
        still_running = value.copy()
        while np.any(still_running) and period >= baseperiod:
            ids, values = entity.value_for_period(bool_expr, period, context,
                                                  fill=None)
            missing = np.ones(res_size, dtype=bool)
            period_value = np.zeros(res_size, dtype=bool)
            if len(ids):
                value_rows = id_to_rownum[ids]
                safe_put(missing, value_rows, False)
                safe_put(period_value, value_rows, values)

            value = still_running & period_value
            result += value * (last_period_true - period)

            still_running &= period_value | missing
            last_period_true[period_value] = period
            period -= 1
        return result
Example #23
 def value_for_period(self, expr, period, context, fill='auto'):
     sub_context = EntityContext(self,
                                 {'periods': [period],
                                  'period_idx': 0,
                                  'format_date': context['format_date'],
                                  '__globals__': context['__globals__']})
     result = expr_eval(expr, sub_context)
     if isinstance(result, np.ndarray) and result.shape:
         ids = expr_eval(Variable('id'), sub_context)
         if fill is None:
             return ids, result
         else:
             # expand values to the current "outer" context
             return self.fill_missing_values(ids, result, context, fill)
     else:
         return result
Example #25
 def run_guarded(self, context):
     while expr_eval(self.cond, context):
         self.code.run_guarded(context)
         # FIXME: this is a bit brutal :) This is necessary because
         # otherwise test_while loops indefinitely (because "values" is
         # never incremented)
         expr_cache.clear()
Example #26
            def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
                global local_ctx
                
                set2_size = context_length(local_ctx)
                if not set2_size:
                    raise StopIteration
                
                if set2_size > pool_size:
                    pool = random.sample(range(set2_size), pool_size)
                else:
                    pool = range(set2_size)

                sub_local_ctx = context_subset(local_ctx, pool, None)
                sub_local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
                
                set2_scores = expr_eval(score_expr, sub_local_ctx)
    
                individual2_pool_idx = np.argmax(set2_scores)
                individual2_idx = pool[individual2_pool_idx]
                
                id1 = sub_local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]
    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
Example #27
         def match_one_set1_individual(idx, sorted_idx):
             global local_ctx
 
             if not context_length(local_ctx):
                 raise StopIteration
 
             local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
 
 #            pk = tuple(individual1[fname] for fname in pk_names)
 #            optimized_expr = optimized_exprs.get(pk)
 #            if optimized_expr is None:
 #                for name in pk_names:
 #                    fake_set1['__f_%s' % name].value = individual1[name]
 #                optimized_expr = str(symbolic_expr.simplify())
 #                optimized_exprs[pk] = optimized_expr
 #            set2_scores = evaluate(optimized_expr, mm_dict, set2)
 
             set2_scores = expr_eval(score_expr, local_ctx)
 
             individual2_idx = np.argmax(set2_scores)
 
             id1 = local_ctx['id']
             id2 = local_ctx['__other_id'][individual2_idx]
 
             local_ctx = context_delete(local_ctx, individual2_idx)
 
             result[id_to_rownum[id1]] = id2
             result[id_to_rownum[id2]] = id1            
Example #29
    def compute(self, context, link, target_expr, missing_value=None):
        """
        link must be a Link instance
        target_expr can be any expression (it will be evaluated on the
                          target rows)
        """
        assert isinstance(link, Link)
        assert isinstance(target_expr, Expr), str(type(target_expr))

        #noinspection PyProtectedMember
        target_ids = context[link._link_field]
        target_context = self.target_context(context)

        id_to_rownum = target_context.id_to_rownum

        missing_int = missing_values[int]
        target_rows = id_to_rownum[target_ids]

        target_values = expr_eval(target_expr, target_context)
        if missing_value is None:
            missing_value = get_missing_value(target_values)

        result_values = target_values[target_rows]

        # it is a bit faster with numexpr (mixed_links: 0.22s -> 0.17s)
        return ne.evaluate("where((ids != mi) & (rows != mi), values, mv)",
                           {'ids': target_ids, 'rows': target_rows,
                            'values': result_values, 'mi': missing_int,
                            'mv': missing_value})
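
The numexpr call is just a fused, single-pass version of the equivalent NumPy expression. A standalone sketch of the equivalence, assuming numexpr is installed and using made-up ids/rows/values:

import numpy as np
import numexpr as ne

mi = -1                                      # missing int marker
ids = np.array([3, -1, 7, 5])                # link target ids
rows = np.array([0, -1, 2, -1])              # rows those ids map to
values = np.array([10.0, 20.0, 30.0, 40.0])  # values gathered from targets
mv = np.nan                                  # missing value for the result

np_result = np.where((ids != mi) & (rows != mi), values, mv)
ne_result = ne.evaluate("where((ids != mi) & (rows != mi), values, mv)",
                        {'ids': ids, 'rows': rows, 'values': values,
                         'mi': mi, 'mv': mv})
assert np.allclose(np_result, ne_result, equal_nan=True)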
Example #30
 def run(self, context):
     value = expr_eval(self.expr, context)
     # Assignment to a field with a name == None is valid: it simply means
     # the result must not be stored. This happens when a user does not
     # store the result of an expression anywhere (it usually has side
     # effects -- csv, new, remove, ...).
     if self.name is not None:
         self.store_result(value, context)
Example #31
    def compute(self, context, link, target_expr, target_filter=None):
        # assert isinstance(context, EntityContext), \
        #         "one2many aggregates in groupby are currently not supported"
        assert isinstance(link, One2Many), "%s (%s)" % (link, type(link))

        # eg (in household entity):
        # persons: {type: one2many, target: person, field: hh_id}
        target_context = link._target_context(context)
        # this is a one2many, so the link column is on the target side
        #noinspection PyProtectedMember
        source_ids = target_context[link._link_field]
        expr_value = expr_eval(target_expr, target_context)
        filter_value = expr_eval(target_filter, target_context)
        if filter_value is not None:
            source_ids = source_ids[filter_value]
            # intentionally not using np.isscalar because of some corner
            # cases, eg. None and np.array(1.0)
            if isinstance(expr_value, np.ndarray) and expr_value.shape:
                expr_value = expr_value[filter_value]

        missing_int = missing_values[int]

        id_to_rownum = context.id_to_rownum
        if len(id_to_rownum):
            try:
                source_rows = id_to_rownum[source_ids]
            except:
                import pdb
                pdb.set_trace()
            # filter out missing values: those where the value of the link
            # points to nowhere (-1)
            #XXX: use np.putmask(source_rows, source_ids == missing_int,
            #                    missing_int)
            source_rows[source_ids == missing_int] = missing_int
        else:
            assert np.all(source_ids == missing_int)
            # we need to make a copy because eval_rows modifies the array
            # in place in some cases (countlink and descendants)
            #TODO: document this fact in eval_rows
            source_rows = source_ids.copy()

        if isinstance(expr_value, np.ndarray) and expr_value.shape:
            assert len(source_rows) == len(expr_value), \
                "%d != %d" % (len(source_rows), len(expr_value))

        return self.eval_rows(source_rows, expr_value, context)
Example #32
    def run_guarded(self, simulation, const_dict):
        while True:
            context = EntityContext(self.entity, const_dict.copy())
            cond_value = expr_eval(self.cond, context)
            if not cond_value:
                break

            self.code.run_guarded(simulation, const_dict)
Example #34
    def evaluate(self, context):
        link = self.get_link(context)
        target_ids = expr_eval(Variable(link._link_field), context)
        target_context = self.target_context(context)

        id_to_rownum = target_context.id_to_rownum

        missing_int = missing_values[int]
        target_rows = id_to_rownum[target_ids]

        target_values = expr_eval(self.target_expression, target_context)
        missing_value = self.missing_value
        if missing_value is None:
            missing_value = get_missing_value(target_values)

        valid_link = (target_ids != missing_int) & (target_rows != missing_int)
        return np.where(valid_link, target_values[target_rows], missing_value)
Example #35
    def run(self, context):
        entity = context['__entity__']
        period = context['period']
        fname = self.fname.format(entity=entity.name, period=period)
        print "writing to", fname, "...",
        file_path = os.path.join(config.output_directory, fname)

        with open(file_path, self.mode + 'b') as f:
            dataWriter = csv.writer(f)
            for arg in self.args:
                if isinstance(arg, TableExpression):
                    data = expr_eval(arg, context)
                elif isinstance(arg, (list, tuple)):
                    data = [[expr_eval(expr, context) for expr in arg]]
                else:
                    data = [[expr_eval(arg, context)]]
                dataWriter.writerows(data)
Example #36
    def compute(self, context, expr, filter=None, skip_na=True):
        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            expr = BinaryOp('*', expr, filter_expr)

        values = expr_eval(expr, context)
        values = np.asarray(values)

        return na_sum(values) if skip_na else np.sum(values)
Example #37
         def create_cost(idx, sorted_idx):
 
             global cost
             if not context_length(local_ctx):
                 raise StopIteration
             local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
 
             set2_scores = expr_eval(score_expr, local_ctx)
             cost.append(set2_scores[:].tolist())
Example #39
 def value_for_period(expr, period, context, fill='auto'):
     sub_context = context.clone(fresh_data=True, period=period)
     result = expr_eval(expr, sub_context)
     if isinstance(result, np.ndarray) and result.shape:
         ids = sub_context['id']
         if fill is None:
             return ids, result
         else:
             # expand values to the current "outer" context
             return TimeFunction.fill_missing_values(
                 ids, result, context, fill)
     else:
         return result
Example #40
    def run_guarded(self, context, *args, **kwargs):
        # XXX: wouldn't some form of cascading context make all this junk much
        # cleaner? Context(globalvars, localvars) (globalvars contain both
        # entity fields and global temporaries)

        backup = self.backup_and_purge_locals()

        if len(args) != len(self.argnames):
            raise TypeError("%s() takes exactly %d arguments (%d given)" %
                            (self.name, len(self.argnames), len(args)))

        for name in self.argnames:
            if name in self.entity.fields.names:
                raise ValueError("function '%s' cannot have an argument named "
                                 "'%s' because there is a field with the "
                                 "same name" % (self.name, name))

        # contextual filter should not transfer to the called function (even
        # if that would somewhat make sense) because in many cases the
        # columns used in the contextual filter are not available within the
        # called function. This is only relevant for functions called within
        # an if() expression.
        context = context.clone(filter_expr=None)

        # add arguments to the local namespace
        for name, value in zip(self.argnames, args):
            # backup the variable if it existed in the caller namespace
            if name in self.entity.temp_variables:
                # we can safely assign to backup without checking if that name
                # was already assigned because it is not possible for a variable
                # to be both in entity.temp_variables and in backup (they are
                # removed from entity.temp_variables).
                backup[name] = self.entity.temp_variables.pop(name)

            # cannot use context[name] = value because that would store the
            # value in .extra, which is wiped at the start of each process
            # and we need it to be available across all processes of the
            # function
            self.entity.temp_variables[name] = value
        try:
            self.code.run_guarded(context)
            result = expr_eval(self.result, context)
        except ReturnException as r:
            result = r.result
        self.purge_and_restore_locals(backup)
        return result
Example #41
    def execute(self, s):
        entity = self.entity
        if entity is None:
            raise Exception(entity_required)

        period = self.period
        if period is None:
            raise Exception(period_required)

        entity_name = self.entity.name
        parse_ctx = self.parse_ctx.copy()
        local_parse_ctx = parse_ctx[entity_name].copy()

        # add all currently defined temp_variables because otherwise
        # local variables (defined within a function) wouldn't be available
        local_parse_ctx.update((name, Variable(entity, name))
                               for name in entity.temp_variables.keys())
        parse_ctx[entity_name] = local_parse_ctx
        expr = parse(s, parse_ctx, interactive=True)
        result = expr_eval(expr, self.eval_ctx)
        if result is None:
            print("done.")
        return result
Example #42
    def compute(self,
                context,
                entity_name=None,
                filter=None,
                number=None,
                **kwargs):
        if filter is not None and number is not None:
            # Having neither is allowed, though, as there can be a contextual
            # filter. Also, there is no reason to prevent the whole population
            # from giving birth, even though the usefulness of such usage
            # seems dubious.
            raise ValueError("new() 'filter' and 'number' arguments are "
                             "mutually exclusive")
        source_entity = context.entity
        if entity_name is None:
            target_entity = source_entity
        else:
            target_entity = context.entities[entity_name]

        # target context is the context where the new individuals will be
        # created
        if target_entity is source_entity:
            target_context = context
        else:
            # we do need to copy the data (.extra) because we will insert into
            # the entity.array anyway => fresh_data=True
            target_context = context.clone(fresh_data=True,
                                           entity_name=target_entity.name)

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif number is not None:
            to_give_birth = None
            num_birth = number
        else:
            to_give_birth = np.ones(len(context), dtype=bool)
            num_birth = len(context)

        array = target_entity.array
        default_values = target_entity.fields.default_values

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth,
                                        default_values)
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context.period

            used_variables = [
                v.name for v in self._collect_kwargs_variables(kwargs)
            ]
            if to_give_birth is None:
                assert not used_variables
                child_context = context.empty(num_birth)
            else:
                child_context = context.subset(to_give_birth, used_variables,
                                               filter_expr)
            for k, v in kwargs.items():
                if k not in array.dtype.names:
                    print("WARNING: {} is unknown, ignoring it!".format(k))
                    continue
                children[k] = expr_eval(v, child_context)

        add_individuals(target_context, children)

        expr_cache.invalidate(context.period, context.entity_name)

        # result is the ids of the new individuals corresponding to the source
        # entity
        if to_give_birth is not None:
            result = np.full(context_length(context), -1, dtype=int)
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a bit faster, but is currently buggy when
            # working with columns of structured arrays.
            # See https://github.com/numpy/numpy/issues/2462
            result[to_give_birth] = children['id']
            return result
        else:
            return None
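
The tail of compute() maps each newborn id back onto the row of the individual that gave birth: result starts at -1 everywhere and the child ids are scattered into the rows selected by the boolean mask. A standalone sketch of that scatter (ignoring the cross-entity and concatenation handling above):

import numpy as np

to_give_birth = np.array([False, True, False, True, True, False])
num_birth = to_give_birth.sum()                  # 3
children_ids = np.arange(100, 100 + num_birth)   # made-up new ids

result = np.full(len(to_give_birth), -1, dtype=int)
result[to_give_birth] = children_ids             # k-th True row gets k-th id
print(result)   # [ -1 100  -1 101 102  -1]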
Example #43
        def match_cell(idx, sorted_idx, pool_size):
            global matching_ctx

            set2_size = context_length(matching_ctx)
            if not set2_size:
                raise StopIteration

            if pool_size is not None and set2_size > pool_size:
                pool = random.sample(range(set2_size), pool_size)
                local_ctx = context_subset(matching_ctx, pool)
            else:
                local_ctx = matching_ctx.copy()

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in {'__ids__'} | used_variables1)

            eval_ctx = context.clone(entity_data=local_ctx)
            set2_scores = expr_eval(score, eval_ctx)
            cell2_idx = set2_scores.argmax()

            cell1ids = local_ctx['__ids__']
            cell2ids = local_ctx['__other___ids__'][cell2_idx]

            if pool_size is not None and set2_size > pool_size:
                # transform pool-local index to set/matching_ctx index
                cell2_idx = pool[cell2_idx]

            cell1size = len(cell1ids)
            cell2size = len(cell2ids)
            nb_match = min(cell1size, cell2size)

            # we could introduce a random choice here but it is not
            # much necessary. In that case, it should be done in group_context
            ids1 = cell1ids[:nb_match]
            ids2 = cell2ids[:nb_match]

            result[id_to_rownum[ids1]] = ids2
            result[id_to_rownum[ids2]] = ids1
            
            if nb_match == cell2size:
                matching_ctx = context_delete(matching_ctx, cell2_idx)
            else:
                # other variables do not need to be modified since the cell
                # only got smaller and was not deleted
                matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

            # FIXME: the expr gets cached for the full matching_ctx at the
            # beginning, then when another woman with the same values is
            # found, it thinks it can reuse the cached expr, but that breaks
            # because it does not have the correct length.

            # the current workaround is to invalidate the whole cache for the
            # current entity but this is not the right way to go.
            # * disable the cache for matching?
            # * use a local cache so that methods after matching() can use
            # what was in the cache before matching(). Shouldn't the cache be
            # stored inside the context anyway?
            expr_cache.invalidate(context.period, context.entity_name)

            if nb_match < cell1size:
                set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
                match_cell(idx, sorted_idx, pool_size)
Example #44
 def run_guarded(self, context):
     raise ReturnException(expr_eval(self.result_expr, context))
Example #45
    def compute(self, context, *args, **kwargs):
        filter_value = kwargs.pop('filter', None)
        missing = kwargs.pop('missing', None)
        # periods = kwargs.pop('periods', None)
        header = kwargs.pop('header', True)
        limit = kwargs.pop('limit', None)
        entity = context.entity

        if args:
            expressions = list(args)
        else:
            # extra=False because we don't want globals nor "system" variables
            # (nan, period, __xxx__)
            # FIXME: we should also somehow "traverse" expressions in this case
            # too (args is ()) => all keys in the current context
            expressions = [
                Variable(entity, name) for name in context.keys(extra=False)
            ]

        str_expressions = [str(e) for e in expressions]
        if 'id' not in str_expressions:
            str_expressions.insert(0, 'id')
            expressions.insert(0, Variable(entity, 'id'))
            id_pos = 0
        else:
            id_pos = str_expressions.index('id')

        #        if (self.periods is not None and len(self.periods) and
        #            'period' not in str_expressions):
        #            str_expressions.insert(0, 'period')
        #            expressions.insert(0, Variable('period'))
        #            id_pos += 1

        columns = []
        for expr in expressions:
            if filter_value is False:
                # dtype does not matter much
                expr_value = np.empty(0)
            else:
                expr_value = expr_eval(expr, context)
                if (filter_value is not None
                        and isinstance(expr_value, np.ndarray)
                        and expr_value.shape):
                    expr_value = expr_value[filter_value]
            columns.append(expr_value)

        ids = columns[id_pos]
        if isinstance(ids, np.ndarray) and ids.shape:
            numrows = len(ids)
        else:
            # FIXME: we need a test for this case (no idea how this can happen)
            numrows = 1

        # expand scalar columns to full columns in memory
        # TODO: handle or explicitly reject columns with ndim > 1
        for idx, col in enumerate(columns):
            dtype = None
            if not isinstance(col, np.ndarray):
                dtype = type(col)
            elif not col.shape:
                dtype = col.dtype.type
            if dtype is not None:
                # TODO: try using itertools.repeat instead as it seems to be a
                # bit faster and would consume less memory (however, it might
                # not play very well with Pandas.to_csv)
                newcol = np.full(numrows, col, dtype=dtype)
                columns[idx] = newcol

        if limit is not None:
            assert isinstance(limit, int)
            columns = [col[:limit] for col in columns]

        data = zip(*columns)
        table = chain([str_expressions], data) if header else data
        return PrettyTable(table, missing)
Example #46
    def align_link(self, context, score, need, filter, take, leave,
                   expressions, possible_values, errors, frac_need, link,
                   secondary_axis, method):
        target_context = link._target_context(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values,
                            target_context)

        # handle secondary axis
        if isinstance(secondary_axis, Expr):
            axis_name = str(secondary_axis)
            try:
                secondary_axis = need.dim_names.index(axis_name)
            except ValueError:
                raise ValueError("invalid value for secondary_axis: there is "
                                 "no axis named '%s' in the need array" %
                                 axis_name)
        else:
            if secondary_axis >= need.ndim:
                raise Exception("%d is an invalid value for secondary_axis: "
                                "it should be smaller than the number of "
                                "dimension of the need array (%d)" %
                                (secondary_axis, need.ndim))

        # evaluate columns
        target_columns = [expr_eval(e, target_context) for e in expressions]
        # this is a one2many, so the link column is on the target side
        link_column = target_context[link._link_field]

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            reverse_link = Many2One("reverse", link._link_field,
                                    context.entity.name)
            target_filter = LinkGet(reverse_link, filter_expr, False)
            target_filter_value = expr_eval(target_filter, target_context)

            # It is often not a good idea to pre-filter columns like this
            # because we lose information about "indices", but in this case
            # it is fine, because we do not need that information afterwards.
            filtered_columns = [
                col[target_filter_value]
                if isinstance(col, np.ndarray) and col.shape else [col]
                for col in target_columns
            ]

            link_column = link_column[target_filter_value]
        else:
            filtered_columns = target_columns
            target_filter_value = None

        # compute labels for filtered columns
        # -----------------------------------
        # We can't use _group_labels_light because group_labels assigns labels
        # on a first come, first served basis, not using the order they are
        # in pvalues
        fcols_labels = []
        filtered_length = len(filtered_columns[0])
        unaligned = np.zeros(filtered_length, dtype=bool)
        for fcol, pvalues in zip(filtered_columns, need.pvalues):
            pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
            fcol_labels = np.empty(filtered_length, dtype=np.int32)
            for i in range(filtered_length):
                value_idx = pvalues_index.get(fcol[i], -1)
                if value_idx == -1:
                    unaligned[i] = True
                fcol_labels[i] = value_idx
            fcols_labels.append(fcol_labels)

        num_unaligned = np.sum(unaligned)
        if num_unaligned:
            # further filter label columns and link_column
            validlabels = ~unaligned
            fcols_labels = [labels[validlabels] for labels in fcols_labels]
            link_column = link_column[validlabels]

            # display who are the evil ones
            ids = target_context['id']
            if target_filter_value is not None:
                filtered_ids = ids[target_filter_value]
            else:
                filtered_ids = ids
            self._display_unaligned(expressions, filtered_ids,
                                    filtered_columns, unaligned)
        else:
            del unaligned

        id_to_rownum = context.id_to_rownum
        missing_int = missing_values[int]
        source_ids = link_column

        if len(id_to_rownum):
            source_rows = id_to_rownum[source_ids]
            # filter out missing values: those where the value of the link
            # points to nowhere (-1)
            source_rows[source_ids == missing_int] = missing_int
        else:
            assert np.all(source_ids == missing_int)
            source_rows = []

        # filtered_columns are not filtered further on invalid labels
        # (num_unaligned) but this is not a problem since those will be
        # ignored by GroupBy anyway.
        # TODO: this is ugly because a groupby on "values" returns an LArray
        # with those values (ndarrays) as axis names. Ugh.
        groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

        # FIXME: target_context is not correct, as it is not filtered while
        # filtered_columns are. Since we do not use the context "columns" it
        # mostly works but I had to disable an assertion in utils.expand
        # because the length of the context is not correct.
        num_candidates = expr_eval(groupby_expr, target_context)

        # fetch the list of linked individuals for each local individual.
        # e.g. the list of person ids for each household
        hh = np.empty(context_length(context), dtype=object)
        # we can't use .fill([]) because it reuses the same list for all
        # objects
        for i in range(len(hh)):
            hh[i] = []

        # Even though this is highly sub-optimal, the time taken to create
        # those lists of ids is very small compared to the total time taken
        # for align_other (0.2s vs 4.26), so I shouldn't care too much about
        # it for now.

        # target_row (row of person) is an index valid for *filtered/label*
        # columns !
        for target_row, source_row in enumerate(source_rows):
            if source_row == -1:
                continue
            hh[source_row].append(target_row)

        class FakeContainer(object):
            def __init__(self, length):
                self.length = length

            def __len__(self):
                return self.length

        groups = [FakeContainer(g) for g in num_candidates]
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        aligned, error = \
            align_link_nd(score, need, num_candidates, hh, fcols_labels,
                          secondary_axis)
        self.past_error = error
        return aligned
Example #47
    def compute(self, context, set1filter, set2filter, score, orderby,
                pool_size=None, algo='onebyone'):
        global matching_ctx

        if pool_size is not None:
            assert isinstance(pool_size, int)
            assert pool_size > 0

        set1filterexpr = self._getfilter(context, set1filter)
        set1filtervalue = expr_eval(set1filterexpr, context)
        set2filterexpr = self._getfilter(context, set2filter)
        set2filtervalue = expr_eval(set2filterexpr, context)
        set1len = set1filtervalue.sum()
        set2len = set2filtervalue.sum()
        print("matching with %d/%d individuals" % (set1len, set2len), end='')

        varnames = {v.name for v in score.collect_variables()}
        used_variables1 = {n for n in varnames if not n.startswith('__other_')}
        used_variables2 = {n[8:] for n in varnames if n.startswith('__other_')}
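        # e.g. a score expression using 'age' and '__other_age' yields
        # used_variables1 == {'age'} and used_variables2 == {'age'}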

        if isinstance(orderby, str):
            assert orderby == 'EDtM'
            orderby_vars = used_variables1
        else:
            orderby_vars = {v.name for v in orderby.collect_variables()}

        if algo == 'onebyone':
            all_vars = {'id'} | used_variables1 | orderby_vars
            set1 = context.subset(set1filtervalue, all_vars, set1filterexpr)
            set2 = context.subset(set2filtervalue, {'id'} | used_variables2,
                                  set2filterexpr)

            # subset creates a dict for the current entity, so .entity_data is a
            # dict
            set1 = set1.entity_data
            set2 = set2.entity_data

            set1['__ids__'] = set1['id'].reshape(set1len, 1)
            set2['__ids__'] = set2['id'].reshape(set2len, 1)

            print()
        else:
            # optimized matching by grouping sets by values, which usually
            # means smaller sets and improved running time.
            assert algo == 'byvalue'

            # if orderby contains variables that are not used in the score
            # expression, this will effectively add variables in the
            # matching context AND group by those variables. This is correct
            # because otherwise (if we did not group by them), we could have
            # groups containing individuals with different values of the
            # ordering variables (ie the ordering would not be respected).
            set1 = group_context(used_variables1 | orderby_vars,
                                 set1filtervalue, context)
            set2 = group_context(used_variables2, set2filtervalue, context)

            # we cannot simply take the [:min(set1len, set2len)] indices like in
            # the non-optimized case and iterate over that because we don't know
            # how many groups we will need to match.
            print(" (%d/%d groups)"
                  % (context_length(set1), context_length(set2)))

        if isinstance(orderby, str):
            orderbyvalue = np.zeros(context_length(set1))
            for name in used_variables1:
                column = set1[name]
                orderbyvalue += (column - column.mean()) ** 2 / column.var()
        else:
            orderbyvalue = expr_eval(orderby, context.clone(entity_data=set1))

        # Delete variables which are not in the score expression (but in the
        # orderby expr or possibly "id") because they are no longer needed and
        # would slow things down.
        context_keep(set1, used_variables1)
        context_keep(set2, used_variables2)

        sorted_set1_indices = orderbyvalue.argsort()[::-1]

        result = np.full(context_length(context), -1, dtype=int)
        id_to_rownum = context.id_to_rownum

        # prefix all keys except __len__
        matching_ctx = {'__other_' + k if k != '__len__' else k: v
                        for k, v in set2.iteritems()}
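        # e.g. with set2 == {'age': ..., '__ids__': ..., '__len__': 4}, this
        # gives {'__other_age': ..., '__other___ids__': ..., '__len__': 4}
        # (hence the triple underscore in '__other___ids__' used below)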

        def match_cell(idx, sorted_idx, pool_size):
            global matching_ctx

            set2_size = context_length(matching_ctx)
            if not set2_size:
                raise StopIteration

            if pool_size is not None and set2_size > pool_size:
                pool = random.sample(xrange(set2_size), pool_size)
                local_ctx = context_subset(matching_ctx, pool)
            else:
                local_ctx = matching_ctx.copy()

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in {'__ids__'} | used_variables1)

            eval_ctx = context.clone(entity_data=local_ctx)
            set2_scores = expr_eval(score, eval_ctx)
            cell2_idx = set2_scores.argmax()

            cell1ids = local_ctx['__ids__']
            cell2ids = local_ctx['__other___ids__'][cell2_idx]

            if pool_size is not None and set2_size > pool_size:
                # transform pool-local index to set/matching_ctx index
                cell2_idx = pool[cell2_idx]

            cell1size = len(cell1ids)
            cell2size = len(cell2ids)
            nb_match = min(cell1size, cell2size)

            # we could introduce a random choice here but it is not really
            # necessary. In that case, it should be done in group_context
            ids1 = cell1ids[:nb_match]
            ids2 = cell2ids[:nb_match]

            result[id_to_rownum[ids1]] = ids2
            result[id_to_rownum[ids2]] = ids1
            
            if nb_match == cell2size:
                matching_ctx = context_delete(matching_ctx, cell2_idx)
            else:
                # other variables do not need to be modified since the cell
                # only got smaller and was not deleted
                matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

            # FIXME: the expr gets cached for the full matching_ctx at the
            # beginning, and then when another woman with the same values is
            # found, it thinks it can reuse the expr, but it breaks because
            # it does not have the correct length.

            # The current workaround is to invalidate the whole cache for the
            # current entity, but this is not the right way to go:
            # * disable the cache for matching?
            # * use a local cache so that methods after matching() can use
            #   what was in the cache before matching(). Shouldn't the cache
            #   be stored inside the context anyway?
            expr_cache.invalidate(context.period, context.entity_name)

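            # if this set1 cell was larger than the matched set2 cell, retry
            # with the leftover individuals against the remaining set2 cells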
            if nb_match < cell1size:
                set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
                match_cell(idx, sorted_idx, pool_size)
        loop_wh_progress(match_cell, sorted_set1_indices, pool_size)
        return result
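An aside on the 'EDtM' ordering used above (presumably "Euclidean Distance to the Mean"): each individual in set1 is scored by its squared distance to the mean of every matching variable, standardized by that variable's variance, and candidates are matched starting from the most atypical one. A minimal, self-contained NumPy sketch of that computation (the data and the variable names age/income are made up for illustration):

    import numpy as np

    # toy set1 with two matching variables
    set1 = {'age': np.array([25., 30., 60., 41., 33.]),
            'income': np.array([1000., 1200., 3000., 1800., 1100.])}

    # squared distance to the mean, standardized by the variance
    # (mirrors the loop over used_variables1 above)
    orderbyvalue = np.zeros(5)
    for column in set1.values():
        orderbyvalue += (column - column.mean()) ** 2 / column.var()

    # match the most "atypical" individuals first
    sorted_set1_indices = orderbyvalue.argsort()[::-1]
    print(sorted_set1_indices)  # individual 2 (age 60, income 3000) is first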
Example #48
0
    def compute(self, context, *expressions, **kwargs):
        if not expressions:
            raise TypeError("groupby() takes at least 1 argument")

        # TODO: allow lists/tuples of arguments to group by the combinations
        # of keys
        for expr in expressions:
            if isinstance(expr, (bool, int, float)):
                raise TypeError("groupby() does not work with constant "
                                "arguments")
            if isinstance(expr, (tuple, list)):
                raise TypeError("groupby() takes expressions as arguments, "
                                "not a list of expressions")

        # On Python 3, we could clean up this code (keyword-only arguments).
        expr = kwargs.pop('expr', None)
        if expr is None:
            expr = Count()

#        by = kwargs.pop('by', None)
        filter_value = kwargs.pop('filter', None)
        percent = kwargs.pop('percent', False)
        possible_values = kwargs.pop('pvalues', None)

        expr_vars = [v.name for v in collect_variables(expr)]
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        columns = [expand(c, context_length(context)) for c in columns]

        if filter_value is not None:
            filtered_columns = [col[filter_value] for col in columns]
            # FIXME: use the actual filter_expr instead of not_hashable
            filtered_context = context.subset(filter_value, expr_vars,
                                              not_hashable)
        else:
            filtered_columns = columns
            filtered_context = context

        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to partition_nd
        # because it is a bit faster this way. The indices are still correct,
        # because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            return LabeledArray([], labels, possible_values)

        # evaluate the expression on each group
        # we use not_hashable to avoid storing the subset in the cache
        contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in groups]
        data = [expr_eval(expr, c) for c in contexts]

        # TODO: use group_indices_nd directly to avoid using np.unique.
        # This is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present.
#        if self.filter is not None:
#            filter_value = expr_eval(self.filter, context)
#        else:
#            filter_value = True
#
#        d = group_indices_nd(columns, filter_value)
#        pvalues = sorted(d.keys())
#        ndim = len(columns)
#        possible_values = [[pv[i] for pv in pvalues]
#                           for i in range(ndim)]
#        groups = [d[k] for k in pvalues]

        # groups is a (flat) list of lists.
        # The first variable is the outer-most "loop",
        # the last one the inner-most.

        # add total for each row
        len_pvalues = [len(vals) for vals in possible_values]
        width = len_pvalues[-1]
        height = prod(len_pvalues[:-1])

        rows_indices = [np.concatenate([groups[y * width + x]
                                        for x in range(width)])
                        for y in range(height)]
        cols_indices = [np.concatenate([groups[y * width + x]
                                        for y in range(height)])
                        for x in range(width)]
        cols_indices.append(np.concatenate(cols_indices))
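        # e.g. for a 2x3 result (height=2, width=3), rows_indices[0]
        # concatenates groups 0, 1 and 2 (one result row), cols_indices[0]
        # concatenates groups 0 and 3 (one result column), and the last
        # entry of cols_indices covers all groups (the grand total)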

        # evaluate the expression on each "combined" group (ie compute totals)
        row_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in rows_indices]
        row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
        col_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in cols_indices]
        col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]

        if percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behaviour of raising an exception.
            # This can happen at least when using the default expr (count())
            # and the filter yields empty groups
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value for value in row_totals]
            col_totals = [100.0 * value / total_value for value in col_totals]

#        if self.by or self.percent:
#            if self.percent:
#                total_value = data[-1]
#                divisors = [total_value for _ in data]
#            else:
#                num_by = len(self.by)
#                inc = prod(len_pvalues[-num_by:])
#                num_groups = len(groups)
#                num_categories = prod(len_pvalues[:-num_by])
#
#                categories_groups_idx = [range(cat_idx, num_groups, inc)
#                                         for cat_idx in range(num_categories)]
#
#                divisors = ...
#
#            data = [100.0 * value / divisor
#                    for value, divisor in izip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data)
        # because, if data is a list of ndarrays (for example if we use
        # groupby(a, expr=id)) *and* all the ndarrays have the same length,
        # the result is a 2d array instead of the array of ndarrays we need
        # (at this point).
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr

        # and reshape it
        data = data.reshape(len_pvalues)
        return LabeledArray(data, labels, possible_values,
                            row_totals, col_totals)
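An aside on the partitioning step: partition_nd (a helper from the same codebase) returns, for each combination of possible values, the array of row indices falling in that combination, as a flat list with the last grouping variable varying fastest; the final reshape(len_pvalues) then turns the flat per-group results into the n-dimensional array. A rough, self-contained NumPy sketch of the same idea (toy data and hypothetical variable names, not the real implementation):

    import numpy as np

    gender = np.array([0, 1, 1, 0, 1])
    work = np.array([1, 1, 0, 0, 1])
    possible_values = [np.unique(gender), np.unique(work)]

    # flat list of index groups, last variable varying fastest:
    # (0, 0), (0, 1), (1, 0), (1, 1)
    groups = [np.nonzero((gender == g) & (work == w))[0]
              for g in possible_values[0] for w in possible_values[1]]

    # the equivalent of the default expr (count()): group sizes, reshaped
    len_pvalues = [len(vals) for vals in possible_values]
    data = np.array([len(g) for g in groups]).reshape(len_pvalues)
    print(data)  # [[1 1]
                 #  [1 2]]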