Example #1
    def compute(self, context, set1filter, set2filter, orderby1, orderby2):
        set1filterexpr = self._getfilter(context, set1filter)
        set1filtervalue = expr_eval(set1filterexpr, context)
        set2filterexpr = self._getfilter(context, set2filter)
        set2filtervalue = expr_eval(set2filterexpr, context)
        set1len = set1filtervalue.sum()
        set2len = set2filtervalue.sum()
        numtomatch = min(set1len, set2len)
        print("matching with %d/%d individuals" % (set1len, set2len))
        result = np.full(context_length(context), -1, dtype=int)
        if not numtomatch:
            return result

        sorted_set1_indices = orderby1[set1filtervalue].argsort()[-numtomatch:]
        sorted_set2_indices = orderby2[set2filtervalue].argsort()[-numtomatch:]

        set1ids = context['id'][set1filtervalue]
        set2ids = context['id'][set2filtervalue]

        id_to_rownum = context.id_to_rownum
        id1 = set1ids[sorted_set1_indices]
        id2 = set2ids[sorted_set2_indices]
        # cannot use sorted_setX_indices because those are "local" indices
        result[id_to_rownum[id1]] = id2
        result[id_to_rownum[id2]] = id1
        return result
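
The final pairing step above hinges on id_to_rownum to convert ids back into row positions. A minimal, self-contained sketch of that step with made-up arrays (ids equal row numbers here, so id_to_rownum is the identity, which is not true in general):

    import numpy as np

    ids = np.arange(6)
    id_to_rownum = np.arange(6)     # id -> row; identity in this toy case
    set1 = np.array([True, False, True, False, False, False])
    set2 = np.array([False, True, False, True, True, False])
    orderby1 = np.array([5, 0, 2, 0, 0, 0])
    orderby2 = np.array([0, 1, 0, 9, 4, 0])

    numtomatch = min(set1.sum(), set2.sum())            # 2
    # indices (within each filtered set) of the numtomatch highest-ranked members
    top1 = orderby1[set1].argsort()[-numtomatch:]
    top2 = orderby2[set2].argsort()[-numtomatch:]
    id1 = ids[set1][top1]
    id2 = ids[set2][top2]

    result = np.full(len(ids), -1, dtype=int)
    result[id_to_rownum[id1]] = id2                     # partner of each set1 member
    result[id_to_rownum[id2]] = id1                     # and the reverse link
    print(result)                                       # -> partners: [3, -1, 4, 0, 2, -1]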
Example #2
    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):

        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        # retrieve the columns we need to work with
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to num_to_align, because some
        # individuals might not fit in any group (e.g. if some alignment data
        # is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in no link (currently, the past
        #        error is added... but never computed, so always 0!) or raise
        #        an error in case errors='carry' is used with no link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)
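
The unaligned bookkeeping in the middle of this method reduces to: flag everyone, unflag individuals excluded by the filter, unflag everyone that landed in a group, and whatever remains flagged has no alignment cell. A toy illustration of just that step (the groups here are hypothetical index arrays, not the output of partition_nd):

    import numpy as np

    ctx_length = 8
    filter_value = np.array([True, True, False, True, True, True, False, True])
    groups = [np.array([0, 3]), np.array([4, 7])]       # made-up partition result

    num_to_align = filter_value.sum()                   # 6
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        unaligned[~filter_value] = False                # outside the filter: ignore
        for member_indices in groups:
            unaligned[member_indices] = False           # fits in some cell: aligned
        print(np.flatnonzero(unaligned))                # rows 1 and 5: filtered in, but in no cell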
Example #3
    def eval_rows(self, source_rows, expr_value, weights_value, context):
        fill = get_default_value(expr_value)
        result = np.full(context_length(context), fill, dtype=expr_value.dtype)

        id_sort_indices = np.argsort(source_rows)
        sorted_rownum = source_rows[id_sort_indices]
        sorted_values = expr_value[id_sort_indices]
        groups = groupby(zip(sorted_rownum, sorted_values), key=itemgetter(0))
        aggregate_func = self.aggregate_func
        for rownum, values in groups:
            if rownum == -1:
                continue
            # Note that v[n] is faster than using an itemgetter, even with map
            result[rownum] = aggregate_func(v[1] for v in values)
        return result
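
The method above sorts the link targets, then walks them with itertools.groupby so each target row receives its linked values in one pass. A self-contained sketch using a plain sum as the aggregate and zero as the fill value (the real code takes both from the instance):

    from itertools import groupby
    from operator import itemgetter

    import numpy as np

    source_rows = np.array([2, 0, 2, -1, 0, 1])     # target row of each source item (-1: broken link)
    expr_value = np.array([10, 1, 20, 99, 2, 5])    # value carried by each source item

    result = np.zeros(3, dtype=expr_value.dtype)    # one slot per target row
    order = np.argsort(source_rows)
    pairs = zip(source_rows[order], expr_value[order])
    for rownum, values in groupby(pairs, key=itemgetter(0)):
        if rownum == -1:
            continue                                # linked object no longer exists
        result[rownum] = sum(v[1] for v in values)
    print(result)                                   # [3, 5, 30]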
Example #4
    def _eval_args(self, context):
        args, kwargs = NumpyCreateArray._eval_args(self, context)
        if 'size' in self.argspec.args:
            pos = self.argspec.args.index('size')
            size = args[pos]

            # The original functions return a scalar when size is None, and an
            # array of length one when size is 1.
            # TODO: users should have a way to have the "size=None" behavior. We
            # could differentiate whether None was explicitly passed or comes
            # from the default value (as we did previously: 'size' not in
            # kwargs), but I do not think it is a good idea. Adding a new
            # "sentinel" value (e.g. -1 or "scalar") is probably better.
            if size is None:
                args = args[:pos] + (context_length(context),) + args[pos + 1:]
        return args, kwargs
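
The only non-obvious part is the tuple surgery that swaps a default size=None for the context length while leaving the other positional arguments alone. Reduced to its essentials (the argspec lookup and the NumPy wrapper are omitted, and the argument layout is hypothetical):

    args = (0.0, 1.0, None)       # e.g. (loc, scale, size) with size left at its default
    pos = 2                       # position of 'size' in the argument list
    ctx_len = 100                 # stand-in for context_length(context)

    if args[pos] is None:
        args = args[:pos] + (ctx_len,) + args[pos + 1:]
    print(args)                   # (0.0, 1.0, 100)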
Example #5
    def compute(self, context, filter=None, weights=None):
        if filter is not None:
            filter = np.asarray(filter)

            # TODO: check this at "compile" time (in __init__), though for that we need to know the type of all
            # temporary variables first
            if not np.issubdtype(filter.dtype, np.bool_):
                raise ValueError("count filter must be a boolean expression")
        if weights is not None:
            weights = np.asarray(weights)

        if filter is None and weights is None:
            return context_length(context)
        elif weights is None:
            return np.sum(filter)
        elif filter is None:
            return np.sum(weights)
        else:
            return np.sum(weights * filter)
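
With both a boolean filter and weights, the count is just a masked sum of weights: multiplying by the boolean array zeroes out the filtered-out rows. A quick check of that equivalence with toy values:

    import numpy as np

    filter_value = np.array([True, False, True, True])
    weights = np.array([1.5, 2.0, 0.5, 1.0])

    assert np.sum(filter_value) == 3                            # plain count
    assert np.sum(weights * filter_value) == 3.0                # 1.5 + 0.5 + 1.0
    assert np.sum(weights * filter_value) == np.sum(weights[filter_value])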
Example #6
    def eval_rows(self, source_rows, expr_value, weights_value, context):
        # We can't use a negative value because that is not allowed by
        # bincount, and using a value too high will uselessly increase the size
        # of the array returned by bincount
        idx_for_missing = context_length(context)

        missing_int = missing_values[int]

        # filter out missing values: those where the object pointed to does not
        # exist anymore (the id corresponds to -1 in id_to_rownum)
        # XXX: use np.putmask(source_rows, source_ids == missing_int,
        #                    missing_int)
        source_rows[source_rows == missing_int] = idx_for_missing
        # ideally this should be done with expr_eval in Aggregate.compute but then it is no longer generic
        # and I need to refactor the whole internal API
        if weights_value is not None:
            expr_value = expr_value * weights_value
        counts = self.count(source_rows, expr_value, weights_value)
        # missing entries are filled with zeros, so it works nicely in this case (sum/count)
        counts.resize(idx_for_missing)
        return counts
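
The idea is to redirect rows whose link is broken to one extra sentinel bin (equal to the context length), so the bin counting never sees a negative index, and to drop that bin afterwards. A standalone sketch of the unweighted count case, with np.bincount standing in for self.count and a slice instead of resize:

    import numpy as np

    ctx_len = 4                                   # number of target rows
    source_rows = np.array([0, 2, -1, 2, 0, -1])  # -1: link points to nothing
    idx_for_missing = ctx_len                     # sentinel bin, just past the real rows

    rows = source_rows.copy()
    rows[rows == -1] = idx_for_missing
    counts = np.bincount(rows, minlength=ctx_len + 1)
    print(counts[:ctx_len])                       # [2, 0, 2, 0]: per-row counts, sentinel bin dropped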
Example #7
    def fill_missing_values(ids, values, context, filler='auto'):
        """
        ids: ids present in past period
        values: values in past period
        context: current period context
        """

        if filler == 'auto':
            filler = get_default_value(values)
        result = np.full(context_length(context), filler, dtype=values.dtype)
        if len(ids):
            id_to_rownum = context.id_to_rownum
            # handle the case where there were more objects in the past than
            # in the current period. Currently, remove() keeps old ids, so
            # this never happens, but if we ever change remove(), we'll need
            # to add such a check everywhere we use id_to_rownum
            # invalid_ids = ids > len(id_to_rownum)
            # if np.any(invalid_ids):
            #     fix ids
            rows = id_to_rownum[ids]
            safe_put(result, rows, values)
        return result
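
In short: allocate a result for the current period pre-filled with a default, then scatter the past values into the rows where those ids still live. A toy version with an explicit id_to_rownum, nan as a plausible filler for a float column, and plain fancy indexing instead of safe_put:

    import numpy as np

    id_to_rownum = np.array([3, 0, 4, 1, 2])     # id -> current row (hypothetical)
    past_ids = np.array([1, 2, 4])               # ids that had a value last period
    past_values = np.array([10.0, 20.0, 40.0])

    result = np.full(len(id_to_rownum), np.nan, dtype=past_values.dtype)
    result[id_to_rownum[past_ids]] = past_values
    print(result)                                # [10., nan, 40., nan, 20.]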
Example #8
        def match_cell(idx, sorted_idx, pool_size):
            global matching_ctx

            set2_size = context_length(matching_ctx)
            if not set2_size:
                raise StopIteration

            if pool_size is not None and set2_size > pool_size:
                pool = random.sample(range(set2_size), pool_size)
                local_ctx = context_subset(matching_ctx, pool)
            else:
                local_ctx = matching_ctx.copy()

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in {'__ids__'} | used_variables1)

            eval_ctx = context.clone(entity_data=local_ctx)
            set2_scores = expr_eval(score, eval_ctx)
            cell2_idx = set2_scores.argmax()

            cell1ids = local_ctx['__ids__']
            cell2ids = local_ctx['__other___ids__'][cell2_idx]

            if pool_size is not None and set2_size > pool_size:
                # transform pool-local index to set/matching_ctx index
                cell2_idx = pool[cell2_idx]

            cell1size = len(cell1ids)
            cell2size = len(cell2ids)
            nb_match = min(cell1size, cell2size)

            # we could introduce a random choice here but it is not really
            # necessary. If we did, it should be done in group_context
            ids1 = cell1ids[:nb_match]
            ids2 = cell2ids[:nb_match]

            result[id_to_rownum[ids1]] = ids2
            result[id_to_rownum[ids2]] = ids1

            if nb_match == cell2size:
                matching_ctx = context_delete(matching_ctx, cell2_idx)
            else:
                # other variables do not need to be modified since the cell
                # only got smaller and was not deleted
                matching_ctx['__other___ids__'][cell2_idx] = cell2ids[
                    nb_match:]

            # FIXME: the expr gets cached for the full matching_ctx at the
            # beginning and then, when another woman with the same values is
            # found, it thinks it can reuse the expr but it breaks because it
            # does not have the correct length.

            # the current workaround is to invalidate the whole cache for the
            # current entity but this is not the right way to go.
            # * disable the cache for matching?
            # * use a local cache so that methods after matching() can use
            # what was in the cache before matching(). Shouldn't the cache be
            # stored inside the context anyway?
            expr_cache.invalidate(context.period, context.entity_name)

            if nb_match < cell1size:
                set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
                match_cell(idx, sorted_idx, pool_size)
Example #9
    def align_link(self, context, score, need, filter, take, leave,
                   expressions, possible_values, errors, frac_need, link,
                   secondary_axis, method):
        target_context = link._target_context(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values,
                            target_context)

        # handle secondary axis
        if isinstance(secondary_axis, Expr):
            axis_name = str(secondary_axis)
            try:
                secondary_axis = need.axes.names.index(axis_name)
            except ValueError:
                raise ValueError("invalid value for secondary_axis: there is "
                                 "no axis named '%s' in the need array"
                                 % axis_name)
        elif isinstance(secondary_axis, int):
            if secondary_axis >= need.ndim:
                raise Exception("%d is an invalid value for secondary_axis: "
                                "it should be smaller than the number of "
                                "dimensions of the need array (%d)"
                                % (secondary_axis, need.ndim))
        else:
            assert secondary_axis is None

        # evaluate columns
        target_columns = [expr_eval(e, target_context) for e in expressions]
        # this is a one2many, so the link column is on the target side
        link_column = target_context[link._link_field]

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            reverse_link = Many2One("reverse", link._link_field,
                                    context.entity.name)
            target_filter = LinkGet(reverse_link, filter_expr, False)
            target_filter_value = expr_eval(target_filter, target_context)

            # It is often not a good idea to pre-filter columns like this
            # because we lose information about "indices", but in this case,
            # it is fine, because we do not need that information afterwards.
            filtered_columns = [col[target_filter_value]
                                if isinstance(col, np.ndarray) and col.shape
                                else [col]
                                for col in target_columns]

            link_column = link_column[target_filter_value]
        else:
            filtered_columns = target_columns
            target_filter_value = None

        # compute labels for filtered columns
        # -----------------------------------
        # We can't use _group_labels_light because group_labels assigns labels
        # on a first come, first served basis, not using the order they are
        # in pvalues
        fcols_labels = []
        filtered_length = len(filtered_columns[0])
        unaligned = np.zeros(filtered_length, dtype=bool)
        # XXX: probably needs to use "possible_values" instead
        for fcol, pvalues in zip(filtered_columns, need.axes.labels):
            pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
            fcol_labels = np.empty(filtered_length, dtype=np.int32)
            for i in range(filtered_length):
                value_idx = pvalues_index.get(fcol[i], -1)
                if value_idx == -1:
                    unaligned[i] = True
                fcol_labels[i] = value_idx
            fcols_labels.append(fcol_labels)

        num_unaligned = np.sum(unaligned)
        if num_unaligned:
            # further filter label columns and link_column
            validlabels = ~unaligned
            fcols_labels = [labels[validlabels] for labels in fcols_labels]
            link_column = link_column[validlabels]

            # display who are the evil ones
            ids = target_context['id']
            if target_filter_value is not None:
                filtered_ids = ids[target_filter_value]
            else:
                filtered_ids = ids
            self._display_unaligned(expressions, filtered_ids,
                                    filtered_columns, unaligned)
        else:
            del unaligned

        id_to_rownum = context.id_to_rownum
        missing_int = missing_values[int]
        source_ids = link_column

        if len(id_to_rownum):
            source_rows = id_to_rownum[source_ids]
            # filter out missing values: those where the value of the link
            # points to nowhere (-1)
            source_rows[source_ids == missing_int] = missing_int
        else:
            assert np.all(source_ids == missing_int)
            source_rows = []

        # filtered_columns are not filtered further on invalid labels
        # (num_unaligned) but this is not a problem since those will be
        # ignored by GroupBy anyway.
        # TODO: the result of this is ugly because a groupby on *values*, returns an LArray with those
        # values (ndarrays) as axes *names*. Ugh.
        groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

        # FIXME: target_context is not correct, as it is not filtered while
        # filtered_columns are. Since we do not use the context "columns" it
        # mostly works but I had to disable an assertion in utils.expand
        # because the length of the context is not correct.
        num_candidates = expr_eval(groupby_expr, target_context)

        # fetch the list of linked individuals for each local individual.
        # e.g. the list of person ids for each household
        hh = np.empty(context_length(context), dtype=object)
        # we can't use .fill([]) because it reuses the same list for all
        # objects
        for i in range(len(hh)):
            hh[i] = []

        # Even though this is highly sub-optimal, the time taken to create
        # those lists of ids is very small compared to the total time taken
        # for align_other (0.2s vs 4.26), so I shouldn't care too much about
        # it for now.

        # target_row (row of person) is an index valid for *filtered/label*
        # columns !
        for target_row, source_row in enumerate(source_rows):
            if source_row == -1:
                continue
            hh[source_row].append(target_row)

        class FakeContainer(object):
            def __init__(self, length):
                self.length = length

            def __len__(self):
                return self.length
        groups = [FakeContainer(g) for g in num_candidates]
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        aligned, error = \
            align_link_nd(score, need, num_candidates, hh, fcols_labels,
                          secondary_axis)
        self.past_error = error
        return aligned
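
The label-computation loop in the middle of this method is the part worth isolating: each column value is translated into its index along the corresponding axis of the need array, and any value absent from the axis labels is flagged as unaligned. A reduced sketch with a single hypothetical column:

    import numpy as np

    pvalues = ['male', 'female']                     # labels of one axis of the need array
    fcol = np.array(['female', 'male', 'other', 'female'], dtype=object)

    pvalues_index = {v: i for i, v in enumerate(pvalues)}
    fcol_labels = np.empty(len(fcol), dtype=np.int32)
    unaligned = np.zeros(len(fcol), dtype=bool)
    for i in range(len(fcol)):
        value_idx = pvalues_index.get(fcol[i], -1)
        if value_idx == -1:
            unaligned[i] = True                      # value not covered by the alignment axis
        fcol_labels[i] = value_idx

    print(fcol_labels)                               # [1, 0, -1, 1]
    print(np.flatnonzero(unaligned))                 # [2]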
Example #10
    def compute(self,
                context,
                entity_name=None,
                filter=None,
                number=None,
                **kwargs):
        if filter is not None and number is not None:
            # Having neither is allowed, though, as there can be a contextual
            # filter. Also, there is no reason to prevent the whole
            # population from giving birth, even though the usefulness of
            # such usage seems dubious.
            raise ValueError("new() 'filter' and 'number' arguments are "
                             "mutually exclusive")
        source_entity = context.entity
        if entity_name is None:
            target_entity = source_entity
        else:
            target_entity = context.entities[entity_name]

        # target context is the context where the new individuals will be
        # created
        if target_entity is source_entity:
            target_context = context
        else:
            # we do need to copy the data (.extra) because we will insert into
            # the entity.array anyway => fresh_data=True
            target_context = context.clone(fresh_data=True,
                                           entity_name=target_entity.name)

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif number is not None:
            to_give_birth = None
            num_birth = number
        else:
            to_give_birth = np.ones(len(context), dtype=bool)
            num_birth = len(context)

        array = target_entity.array
        default_values = target_entity.fields.default_values

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth,
                                        default_values)
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context.period

            used_variables = [
                v.name for v in self._collect_kwargs_variables(kwargs)
            ]
            if to_give_birth is None:
                assert not used_variables
                child_context = context.empty(num_birth)
            else:
                child_context = context.subset(to_give_birth, used_variables,
                                               filter_expr)
            for k, v in kwargs.items():
                if k not in array.dtype.names:
                    print("WARNING: {} is unknown, ignoring it!".format(k))
                    continue
                children[k] = expr_eval(v, child_context)

        add_individuals(target_context, children)

        expr_cache.invalidate(context.period, context.entity_name)

        # result is the ids of the new individuals corresponding to the source
        # entity
        if to_give_birth is not None:
            result = np.full(context_length(context), -1, dtype=int)
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a bit faster, but is currently buggy when
            # working with columns of structured arrays.
            # See https://github.com/numpy/numpy/issues/2462
            result[to_give_birth] = children['id']
            return result
        else:
            return None
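
The last block is the subtle one: when parent and child belong to the same entity, the boolean filter was computed before the newborns were appended, so it is padded with False entries to match the grown array before the new ids are scattered into result. A small sketch of just that step, assuming (as the padding suggests) that the context length already includes the appended rows:

    import numpy as np

    to_give_birth = np.array([False, True, False, True, False])   # computed before the births
    num_birth = int(to_give_birth.sum())                           # 2
    child_ids = np.array([5, 6])                                   # ids given to the newborns

    grown_len = len(to_give_birth) + num_birth                     # array size after add_individuals()
    result = np.full(grown_len, -1, dtype=int)
    # pad the mask with False so it matches the grown array and never selects a newborn row
    to_give_birth = np.concatenate((to_give_birth, np.zeros(num_birth, dtype=bool)))
    result[to_give_birth] = child_ids
    print(result)                                                  # [-1, 5, -1, 6, -1, -1, -1]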
Example #11
    def compute(self, context, *expressions, **kwargs):
        if not expressions:
            raise TypeError("groupby() takes at least 1 argument")

        # TODO: allow lists/tuples of arguments to group by the combinations
        # of keys
        for expr in expressions:
            if isinstance(expr, (bool, int, float)):
                raise TypeError("groupby() does not work with constant "
                                "arguments")
            if isinstance(expr, (tuple, list)):
                raise TypeError("groupby() takes expressions as arguments, "
                                "not a list of expressions")

        # On python 3, we could clean up this code (keyword only arguments).
        expr = kwargs.pop('expr', None)
        if expr is None:
            expr = Count()

#        by = kwargs.pop('by', None)
        filter_value = kwargs.pop('filter', None)
        percent = kwargs.pop('percent', False)
        possible_values = kwargs.pop('pvalues', None)
        totals = kwargs.pop('totals', True)

        expr_vars = [v.name for v in collect_variables(expr)]
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        columns = [expand(c, context_length(context)) for c in columns]

        if filter_value is not None:
            filtered_columns = [col[filter_value] for col in columns]
            # FIXME: use the actual filter_expr instead of not_hashable
            filtered_context = context.subset(filter_value, expr_vars,
                                              not_hashable)
        else:
            filtered_columns = columns
            filtered_context = context

        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to partition_nd
        # because it is a bit faster this way. The indices are still correct,
        # because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            # return la.LArray([], labels, possible_values)
            return la.LArray([])

        # evaluate the expression on each group
        # we use not_hashable to avoid storing the subset in the cache
        contexts = [
            filtered_context.subset(indices, expr_vars, not_hashable)
            for indices in groups
        ]
        data = [expr_eval(expr, c) for c in contexts]

        # TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
        #        if self.filter is not None:
        #            filter_value = expr_eval(self.filter, context)
        #        else:
        #            filter_value = True
        #
        #        d = group_indices_nd(columns, filter_value)
        #        pvalues = sorted(d.keys())
        #        ndim = len(columns)
        #        possible_values = [[pv[i] for pv in pvalues]
        #                           for i in range(ndim)]
        #        groups = [d[k] for k in pvalues]

        # groups is a (flat) list of list.
        # the first variable is the outer-most "loop",
        # the last one the inner most.

        # add total for each row
        len_pvalues = [len(vals) for vals in possible_values]

        if percent:
            totals = True

        if totals:
            width = len_pvalues[-1]
            height = prod(len_pvalues[:-1])
            rows_indices = [
                np.concatenate([groups[y * width + x] for x in range(width)])
                for y in range(height)
            ]
            cols_indices = [
                np.concatenate([groups[y * width + x] for y in range(height)])
                for x in range(width)
            ]
            cols_indices.append(np.concatenate(cols_indices))

            # evaluate the expression on each "combined" group (ie compute totals)
            row_ctxs = [
                filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in rows_indices
            ]
            row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
            col_ctxs = [
                filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in cols_indices
            ]
            col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
        else:
            row_totals = None
            col_totals = None

        if percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behaviour of raising an exception.
            # This can happen at least when using the default expr (count())
            # and the filter yields empty groups
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value for value in row_totals]
            col_totals = [100.0 * value / total_value for value in col_totals]


#        if self.by or self.percent:
#            if self.percent:
#                total_value = data[-1]
#                divisors = [total_value for _ in data]
#            else:
#                num_by = len(self.by)
#                inc = prod(len_pvalues[-num_by:])
#                num_groups = len(groups)
#                num_categories = prod(len_pvalues[:-num_by])
#
#                categories_groups_idx = [range(cat_idx, num_groups, inc)
#                                         for cat_idx in range(num_categories)]
#
#                divisors = ...
#
#            data = [100.0 * value / divisor
#                    for value, divisor in zip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data),
        # because if data is a list of ndarray (for example if we use
        # groupby(a, expr=id), *and* all the ndarrays have the same length,
        # the result is a 2d array instead of an array of ndarrays like we
        # need (at this point).
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr

        # and reshape it
        data = data.reshape(len_pvalues)
        axes = [
            la.Axis(axis_labels, axis_name)
            for axis_name, axis_labels in zip(labels, possible_values)
        ]
        # FIXME: also handle totals
        return la.LArray(data, axes)
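
For the totals, the flat list of groups is read as a height x width grid (last grouping variable varying fastest); row and column totals come from concatenating the member indices of whole rows or columns and re-evaluating the expression on those concatenations. A sketch of the index bookkeeping alone, with hypothetical index arrays standing in for the real groups:

    import numpy as np

    # 2 x 3 partition laid out row-major, i.e. groups[y * width + x]
    groups = [np.array([0]), np.array([1, 2]), np.array([3]),
              np.array([4, 5]), np.array([6]), np.array([7, 8, 9])]
    height, width = 2, 3

    rows_indices = [np.concatenate([groups[y * width + x] for x in range(width)])
                    for y in range(height)]
    cols_indices = [np.concatenate([groups[y * width + x] for y in range(height)])
                    for x in range(width)]
    cols_indices.append(np.concatenate(cols_indices))    # grand total

    print([len(idx) for idx in rows_indices])            # [4, 6]
    print([len(idx) for idx in cols_indices])            # [3, 3, 4, 10]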
Example #12
    def compute(self,
                context,
                set1filter,
                set2filter,
                score,
                orderby,
                pool_size=None,
                algo='onebyone'):
        global matching_ctx

        if pool_size is not None:
            assert isinstance(pool_size, int)
            assert pool_size > 0

        set1filterexpr = self._getfilter(context, set1filter)
        set1filtervalue = expr_eval(set1filterexpr, context)
        set2filterexpr = self._getfilter(context, set2filter)
        set2filtervalue = expr_eval(set2filterexpr, context)
        set1len = set1filtervalue.sum()
        set2len = set2filtervalue.sum()
        print("matching with %d/%d individuals" % (set1len, set2len), end='')

        varnames = {v.name for v in score.collect_variables()}
        used_variables1 = {n for n in varnames if not n.startswith('__other_')}
        used_variables2 = {n[8:] for n in varnames if n.startswith('__other_')}

        if isinstance(orderby, str):
            assert orderby == 'EDtM'
            orderby_vars = used_variables1
        else:
            orderby_vars = {v.name for v in orderby.collect_variables()}

        if algo == 'onebyone':
            all_vars = {'id'} | used_variables1 | orderby_vars
            set1 = context.subset(set1filtervalue, all_vars, set1filterexpr)
            set2 = context.subset(set2filtervalue, {'id'} | used_variables2,
                                  set2filterexpr)

            # subset creates a dict for the current entity, so .entity_data is a
            # dict
            set1 = set1.entity_data
            set2 = set2.entity_data

            set1['__ids__'] = set1['id'].reshape(set1len, 1)
            set2['__ids__'] = set2['id'].reshape(set2len, 1)

            print()
        else:
            # optimized matching by grouping sets by values, which usually
            # means smaller sets and improved running time.
            assert algo == 'byvalue'

            # if orderby contains variables that are not used in the score
            # expression, this will effectively add variables in the
            # matching context AND group by those variables. This is correct
            # because otherwise (if we did not group by them), we could have
            # groups containing individuals with different values of the
            # ordering variables (ie the ordering would not be respected).
            set1 = group_context(used_variables1 | orderby_vars,
                                 set1filtervalue, context)
            set2 = group_context(used_variables2, set2filtervalue, context)

            # we cannot simply take the [:min(set1len, set2len)] indices like in
            # the non-optimized case and iterate over that because we don't know
            # how many groups we will need to match.
            print(" (%d/%d groups)" %
                  (context_length(set1), context_length(set2)))

        if isinstance(orderby, str):
            orderbyvalue = np.zeros(context_length(set1))
            for name in used_variables1:
                column = set1[name]
                orderbyvalue += (column - column.mean())**2 / column.var()
        else:
            orderbyvalue = expr_eval(orderby, context.clone(entity_data=set1))

        # Delete variables which are not in the score expression (but in the
        # orderby expr or possibly "id") because they are no longer needed and
        # would slow things down.
        context_keep(set1, used_variables1)
        context_keep(set2, used_variables2)

        sorted_set1_indices = orderbyvalue.argsort()[::-1]

        result = np.full(context_length(context), -1, dtype=int)
        id_to_rownum = context.id_to_rownum

        # prefix all keys except __len__
        matching_ctx = {
            '__other_' + k if k != '__len__' else k: v
            for k, v in set2.items()
        }

        def match_cell(idx, sorted_idx, pool_size):
            global matching_ctx

            set2_size = context_length(matching_ctx)
            if not set2_size:
                raise StopIteration

            if pool_size is not None and set2_size > pool_size:
                pool = random.sample(range(set2_size), pool_size)
                local_ctx = context_subset(matching_ctx, pool)
            else:
                local_ctx = matching_ctx.copy()

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in {'__ids__'} | used_variables1)

            eval_ctx = context.clone(entity_data=local_ctx)
            set2_scores = expr_eval(score, eval_ctx)
            cell2_idx = set2_scores.argmax()

            cell1ids = local_ctx['__ids__']
            cell2ids = local_ctx['__other___ids__'][cell2_idx]

            if pool_size is not None and set2_size > pool_size:
                # transform pool-local index to set/matching_ctx index
                cell2_idx = pool[cell2_idx]

            cell1size = len(cell1ids)
            cell2size = len(cell2ids)
            nb_match = min(cell1size, cell2size)

            # we could introduce a random choice here but it is not really
            # necessary. If we did, it should be done in group_context
            ids1 = cell1ids[:nb_match]
            ids2 = cell2ids[:nb_match]

            result[id_to_rownum[ids1]] = ids2
            result[id_to_rownum[ids2]] = ids1

            if nb_match == cell2size:
                matching_ctx = context_delete(matching_ctx, cell2_idx)
            else:
                # other variables do not need to be modified since the cell
                # only got smaller and was not deleted
                matching_ctx['__other___ids__'][cell2_idx] = cell2ids[
                    nb_match:]

            # FIXME: the expr gets cached for the full matching_ctx at the
            # beginning and then, when another woman with the same values is
            # found, it thinks it can reuse the expr but it breaks because it
            # does not have the correct length.

            # the current workaround is to invalidate the whole cache for the
            # current entity but this is not the right way to go.
            # * disable the cache for matching?
            # * use a local cache so that methods after matching() can use
            # what was in the cache before matching(). Shouldn't the cache be
            # stored inside the context anyway?
            expr_cache.invalidate(context.period, context.entity_name)

            if nb_match < cell1size:
                set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
                match_cell(idx, sorted_idx, pool_size)

        loop_wh_progress(match_cell, sorted_set1_indices, pool_size)
        return result
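
When orderby='EDtM', the ordering value built near the top of compute() is a sum of squared standardized deviations over the set 1 variables (a Mahalanobis-like distance to the mean that ignores correlations), so the most atypical individuals are matched first. Stripped of the context machinery, with made-up columns:

    import numpy as np

    set1 = {'age': np.array([25.0, 40.0, 70.0, 41.0]),
            'income': np.array([1000.0, 1200.0, 900.0, 5000.0])}

    orderbyvalue = np.zeros(4)
    for column in set1.values():
        orderbyvalue += (column - column.mean()) ** 2 / column.var()

    # highest score first: the income outlier (index 3) comes out on top
    sorted_set1_indices = orderbyvalue.argsort()[::-1]
    print(sorted_set1_indices)                           # [3, 2, 0, 1]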