Beispiel #1
0
    def compute(self, context, filter=None):
        """Remove the rows matching ``filter`` from the current entity.

        Shrinks ``entity.array`` and the entity's array-valued temporary
        variables, rebuilds ``entity.id_to_rownum`` so that both newly and
        previously removed ids map to -1, optionally logs how many rows were
        removed, and invalidates the expression cache for the current
        period/entity.

        Parameters
        ----------
        context : evaluation context providing the current entity and period
        filter : np.ndarray of bool, optional
            True for rows to remove. None removes every row.
        """
        filter_value = filter
        if filter_value is None:
            # this is pretty inefficient, but remove without filter is not
            # common enough to bother
            filter_value = np.ones(len(context), dtype=bool)

        # nothing to remove: leave the entity untouched
        if not np.any(filter_value):
            return

        not_removed = ~filter_value

        entity = context.entity
        len_before = len(entity.array)

        # Shrink array & temporaries. 99% of the function time is spent here.
        entity.array.keep(not_removed)
        temp_variables = entity.temp_variables
        for name, temp_value in temp_variables.items():
            # only subset per-row arrays (0-d arrays have an empty shape)
            if isinstance(temp_value, np.ndarray) and temp_value.shape:
                temp_variables[name] = temp_value[not_removed]

        # update id_to_rownum
        # ids which already mapped to no row before this removal
        already_removed = entity.id_to_rownum == -1
        already_removed_indices = filter_to_indices(already_removed)
        # positions at which the old -1 entries must be re-inserted below
        # (np.insert positions are relative to the array *without* them)
        already_removed_indices_shifted = \
            already_removed_indices - np.arange(len(already_removed_indices))

        # build an old-row -> new-row mapping: each surviving row's new row
        # number is its old row number minus the number of rows removed
        # before it; removed rows map to -1
        id_to_rownum = np.arange(len_before)
        id_to_rownum -= filter_value.cumsum()
        #XXX: use np.putmask(id_to_rownum, filter_value, -1)
        id_to_rownum[filter_value] = -1
        # re-inserting -1 at the previously removed id slots turns the
        # mapping back into an id -> row mapping
        entity.id_to_rownum = np.insert(id_to_rownum,
                                        already_removed_indices_shifted,
                                        -1)
        # this version is cleaner and slightly faster but the result is also
        # slightly wrong: it eliminates ids for dead/removed individuals at
        # the end of the array and this cause bugs in time-related functions
#        ids = entity.array['id']
#        id_to_rownum = np.empty(np.max(ids) + 1, dtype=int)
#        id_to_rownum.fill(-1)
#        id_to_rownum[ids] = np.arange(len(ids), dtype=int)
#        entity.id_to_rownum = id_to_rownum
        if config.log_level == "processes":
            print("%d %s(s) removed (%d -> %d)"
                  % (filter_value.sum(), entity.name, len_before,
                     len(entity.array)),
                  end=' ')

        #TODO: in the case of remove(), we should update (take a subset of) all
        # the cache keys matching the entity, but with the current code,
        # it is most likely not worth it because the cache probably contains
        # mostly stuff we will never use.
        expr_cache.invalidate(context.period, context.entity_name)
Beispiel #2
0
    def compute(self, context, filter=None):
        """Remove the rows matching ``filter`` from the current entity.

        Shrinks ``entity.array`` and the entity's array-valued temporary
        variables, rebuilds ``entity.id_to_rownum`` so that both newly and
        previously removed ids map to -1, optionally logs how many rows were
        removed, and invalidates the expression cache for the current
        period/entity.

        Parameters
        ----------
        context : evaluation context providing the current entity and period
        filter : np.ndarray of bool, optional
            True for rows to remove. None removes every row.
        """
        filter_value = filter
        if filter_value is None:
            # this is pretty inefficient, but remove without filter is not
            # common enough to bother
            filter_value = np.ones(len(context), dtype=bool)

        # nothing to remove: leave the entity untouched
        if not np.any(filter_value):
            return

        not_removed = ~filter_value

        entity = context.entity
        len_before = len(entity.array)

        # Shrink array & temporaries. 99% of the function time is spent here.
        entity.array.keep(not_removed)
        temp_variables = entity.temp_variables
        for name, temp_value in temp_variables.items():
            # only subset per-row arrays (0-d arrays have an empty shape)
            if isinstance(temp_value, np.ndarray) and temp_value.shape:
                temp_variables[name] = temp_value[not_removed]

        # update id_to_rownum
        # ids which already mapped to no row before this removal
        already_removed = entity.id_to_rownum == -1
        already_removed_indices = filter_to_indices(already_removed)
        # positions at which the old -1 entries must be re-inserted below
        # (np.insert positions are relative to the array *without* them)
        already_removed_indices_shifted = \
            already_removed_indices - np.arange(len(already_removed_indices))

        # build an old-row -> new-row mapping: each surviving row's new row
        # number is its old row number minus the number of rows removed
        # before it; removed rows map to -1
        id_to_rownum = np.arange(len_before)
        id_to_rownum -= filter_value.cumsum()
        # XXX: use np.putmask(id_to_rownum, filter_value, -1)
        id_to_rownum[filter_value] = -1
        # re-inserting -1 at the previously removed id slots turns the
        # mapping back into an id -> row mapping
        entity.id_to_rownum = np.insert(id_to_rownum,
                                        already_removed_indices_shifted, -1)
        # this version is cleaner and slightly faster but the result is also
        # slightly wrong: it eliminates ids for dead/removed individuals at
        # the end of the array and this cause bugs in time-related functions
        #        ids = entity.array['id']
        #        id_to_rownum = np.full(np.max(ids) + 1, -1, dtype=int)
        #        id_to_rownum[ids] = np.arange(len(ids), dtype=int)
        #        entity.id_to_rownum = id_to_rownum
        if config.log_level == "processes":
            print("%d %s(s) removed (%d -> %d)" %
                  (filter_value.sum(), entity.name, len_before,
                   len(entity.array)),
                  end=' ')

        # TODO: in the case of remove(), we should update (take a subset of) all
        # the cache keys matching the entity, but with the current code,
        # it is most likely not worth it because the cache probably contains
        # mostly stuff we will never use.
        expr_cache.invalidate(context.period, context.entity_name)
Beispiel #3
0
    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):
        """Align the entity without following a link.

        Evaluates the needed number of individuals per alignment cell,
        partitions the (optionally filtered) individuals into those cells,
        applies the need corrections (fractional needs, past errors) and
        delegates the actual selection to align_get_indices_nd.
        """
        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)
        num_to_align = (ctx_length if filter_value is None
                        else np.sum(filter_value))

        # partition the individuals into the alignment cells
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            nd_filter = True if filter_value is None else filter_value
            groups = partition_nd(columns, nd_filter, possible_values)
        else:
            columns = []
            groups = [np.arange(num_to_align) if filter_value is None
                      else filter_to_indices(filter_value)]

        # some individuals might not fit in any cell (eg if some alignment
        # data is missing), so the group sizes do not necessarily add up to
        # the number of individuals to align
        if sum(len(group) for group in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for group in groups:
                unaligned[group] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in no link (currently, the past
        #        error is added... but never computed, so always 0 !) or raise
        #        an error in case errors='carry" is used with no link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)
Beispiel #4
0
    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):
        """Perform alignment when no link is involved.

        The individuals (restricted by *filter*) are grouped by the values
        of *expressions*, the per-group needs are corrected, and the
        selection itself is delegated to align_get_indices_nd.
        """
        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)
        if filter_value is None:
            num_to_align = ctx_length
        else:
            num_to_align = np.sum(filter_value)

        if not expressions:
            columns = []
            if filter_value is None:
                groups = [np.arange(num_to_align)]
            else:
                groups = [filter_to_indices(filter_value)]
        else:
            # retrieve the columns used to partition the individuals
            columns = [expr_eval(e, context) for e in expressions]
            groups = partition_nd(columns,
                                  filter_value if filter_value is not None
                                  else True,
                                  possible_values)

        # individuals with missing alignment data do not land in any group,
        # so the groups may cover fewer individuals than num_to_align
        num_grouped = sum(len(g) for g in groups)
        if num_grouped < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for indices in groups:
                unaligned[indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in no link (currently, the past
        #        error is added... but never computed, so always 0 !) or raise
        #        an error in case errors='carry" is used with no link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)
Beispiel #5
0
    def align_no_link(self, context):
        """Align the entity without following a link.

        Evaluates the score, the per-cell needs and the filters from this
        instance's expressions, partitions the individuals into the
        alignment cells, corrects the needs and delegates the selection to
        align_get_indices_nd.
        """
        ctx_length = context_length(context)

        scores = expr_eval(self.expr, context)

        need, expressions, possible_values = self._eval_need(context)

        filter_value = expr_eval(self._getfilter(context), context)
        take_filter = expr_eval(self.take_filter, context)
        leave_filter = expr_eval(self.leave_filter, context)

        num_to_align = (ctx_length if filter_value is None
                        else np.sum(filter_value))

        if expressions:
            # retrieve the columns used to partition the individuals
            columns = [expr_eval(expr, context) for expr in expressions]
            nd_filter = filter_value if filter_value is not None else True
            groups = partition_nd(columns, nd_filter, possible_values)
        else:
            columns = []
            if filter_value is None:
                groups = [np.arange(num_to_align)]
            else:
                groups = [filter_to_indices(filter_value)]

        # individuals with missing alignment data do not land in any group,
        # so the groups may cover fewer individuals than num_to_align
        if sum(len(group) for group in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for group in groups:
                unaligned[group] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        #noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need)
        need = self._add_past_error(need, context)

        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    scores, take_filter, leave_filter)
Beispiel #6
0
    def run(self, context):
        """Remove the rows matching ``self.filter`` from the current entity.

        Shrinks ``entity.array`` and the entity's array-valued temporary
        variables, rebuilds ``entity.id_to_rownum`` so that both newly and
        previously removed ids map to -1, then prints how many rows were
        removed.  (Python 2 version: uses ``iteritems`` and the ``print``
        statement.)
        """
        filter_value = expr_eval(self.filter, context)

        # nothing to remove: leave the entity untouched
        if not np.any(filter_value):
            return

        not_removed = ~filter_value

        entity = context['__entity__']
        len_before = len(entity.array)

        #FIXME: this allocates a new (slightly smaller) array. The old
        # array is only discarded when the gc does its job, effectively
        # doubling the peak memory usage for the main array for a while.
        # Seems like another good reason to store columns separately.

        # Shrink array & temporaries. 99% of the function time is spent here.
        entity.array = entity.array[not_removed]
        temp_variables = entity.temp_variables
        for name, temp_value in temp_variables.iteritems():
            # only subset per-row arrays (0-d arrays have an empty shape)
            if isinstance(temp_value, np.ndarray) and temp_value.shape:
                temp_variables[name] = temp_value[not_removed]

        # update id_to_rownum
        # ids which already mapped to no row before this removal
        already_removed = entity.id_to_rownum == -1
        already_removed_indices = filter_to_indices(already_removed)
        # positions at which the old -1 entries must be re-inserted below
        # (np.insert positions are relative to the array *without* them)
        already_removed_indices_shifted = already_removed_indices - \
                                  np.arange(len(already_removed_indices))

        # build an old-row -> new-row mapping: each surviving row's new row
        # number is its old row number minus the number of rows removed
        # before it; removed rows map to -1
        id_to_rownum = np.arange(len_before)
        id_to_rownum -= filter_value.cumsum()
        #XXX: use np.putmask(id_to_rownum, filter_value, -1)
        id_to_rownum[filter_value] = -1
        # re-inserting -1 at the previously removed id slots turns the
        # mapping back into an id -> row mapping
        entity.id_to_rownum = np.insert(id_to_rownum,
                                        already_removed_indices_shifted,
                                        -1)
        # this version is cleaner and slightly faster but the result is also
        # slightly different: it eliminates ids for dead/removed individuals
        # and this cause bugs in time-related functions
#        ids = entity.array['id']
#        id_to_rownum = np.empty(np.max(ids) + 1, dtype=int)
#        id_to_rownum.fill(-1)
#        id_to_rownum[ids] = np.arange(len(ids), dtype=int)
#        entity.id_to_rownum = id_to_rownum

        print "%d %s(s) removed (%d -> %d)" % (filter_value.sum(), entity.name,
                                               len_before, len(entity.array)),
Beispiel #7
0
    def run(self, context):
        """Remove the rows matching ``self.filter`` from the current entity.

        Shrinks ``entity.array`` and the entity's array-valued temporary
        variables, rebuilds ``entity.id_to_rownum`` so that both newly and
        previously removed ids map to -1, and optionally logs how many rows
        were removed.
        """
        filter_value = expr_eval(self.filter, context)

        # nothing to remove: leave the entity untouched
        if not np.any(filter_value):
            return

        not_removed = ~filter_value

        entity = context['__entity__']
        len_before = len(entity.array)

        # Shrink array & temporaries. 99% of the function time is spent here.
        entity.array.keep(not_removed)
        temp_variables = entity.temp_variables
        for name, temp_value in temp_variables.items():
            # only subset per-row arrays (0-d arrays have an empty shape)
            if isinstance(temp_value, np.ndarray) and temp_value.shape:
                temp_variables[name] = temp_value[not_removed]

        # update id_to_rownum
        # ids which already mapped to no row before this removal
        already_removed = entity.id_to_rownum == -1
        already_removed_indices = filter_to_indices(already_removed)
        # positions at which the old -1 entries must be re-inserted below
        # (np.insert positions are relative to the array *without* them)
        already_removed_indices_shifted = \
            already_removed_indices - np.arange(len(already_removed_indices))

        # build an old-row -> new-row mapping: each surviving row's new row
        # number is its old row number minus the number of rows removed
        # before it; removed rows map to -1
        id_to_rownum = np.arange(len_before)
        id_to_rownum -= filter_value.cumsum()
        #XXX: use np.putmask(id_to_rownum, filter_value, -1)
        id_to_rownum[filter_value] = -1
        # re-inserting -1 at the previously removed id slots turns the
        # mapping back into an id -> row mapping
        entity.id_to_rownum = np.insert(id_to_rownum,
                                        already_removed_indices_shifted,
                                        -1)
        # this version is cleaner and slightly faster but the result is also
        # slightly wrong: it eliminates ids for dead/removed individuals at
        # the end of the array and this cause bugs in time-related functions
#        ids = entity.array['id']
#        id_to_rownum = np.empty(np.max(ids) + 1, dtype=int)
#        id_to_rownum.fill(-1)
#        id_to_rownum[ids] = np.arange(len(ids), dtype=int)
#        entity.id_to_rownum = id_to_rownum
        if config.log_level == "processes":
            print("%d %s(s) removed (%d -> %d)" % (filter_value.sum(), entity.name,
                                                   len_before, len(entity.array)),
                                                   end=' ')
Beispiel #8
0
def align_get_indices_nd(ctx_length, groups, need, filter_value, score,
                         take_filter=None, leave_filter=None):
    """Select which individuals get aligned in each group.

    For each group (zipped with ``need.flat``), individuals matching
    *take_filter* are always selected first; the remaining slots are filled
    with the candidates having the highest *score* (candidates matching
    *leave_filter* are excluded).

    Parameters
    ----------
    ctx_length : int
        Length of the returned boolean array.
    groups : sequence of index arrays
        One array of member indices per alignment cell.
    need : np.ndarray of int
        Number of individuals to select per cell (iterated via ``.flat``).
    filter_value : np.ndarray of bool or None
        Global filter restricting which individuals may be selected.
    score : None, bool, int, float or np.ndarray
        Ranking value; when it is not an array, candidates are taken from
        the end of the (unsorted) group instead, ie highest indices first.
    take_filter, leave_filter : np.ndarray of bool, optional
        Individuals to always / never select.

    Returns
    -------
    np.ndarray of bool
        True for the selected ("aligned") individuals.
    """
    assert isinstance(need, np.ndarray) and \
           issubclass(need.dtype.type, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    # NOTE: when filter_value is not None, maybe_filter aliases
    # bool_filter_value, so the in-place &= below also mutates it.
    # bool_filter_value is not read after that point in this version, so
    # this is harmless here.
    maybe_filter = bool_filter_value
    if take_filter is not None:
        #XXX: I wonder if users would prefer if filter_value was taken into
        # account or not. This only impacts what it displayed on the console,
        # but still...
        take = np.sum(take_filter)

        #XXX: it would probably be faster to leave the filters as boolean
        # vector and do
        #     take_members = take_filter[member_indices]
        #     group_always = member_indices[take_members]
        # instead of
        #     group_always = np.intersect1d(members_indices, take_indices,
        #                                   assume_unique=True)
        take_indices = filter_to_indices(take_filter & bool_filter_value)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    # indices of individuals which *may* be selected (filtered, not taken,
    # not left); None means "no take/leave restriction"
    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)
    for members_indices, group_need in izip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected

            if take_indices is not None:
                # members of this group which are always taken
                group_always = np.intersect1d(members_indices, take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    maybe_members_rank_value = score[group_maybe_indices]
                    #TODO: use np.partition (np1.8+)
                    sorted_local_indices = np.argsort(maybe_members_rank_value)
                    sorted_global_indices = \
                        group_maybe_indices[sorted_local_indices]
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                # take the last X individuals (ie those with the highest score)
                indices_to_take = sorted_global_indices[-maybe_to_take:]

                # underflow: not enough candidates to reach the group's need
                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                # overflow: more individuals were always-taken than needed
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    if config.debug and config.log_level == "processes":
        print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
        if (take_filter is not None) or (leave_filter is not None):
            print("[take %d, leave %d]" % (take, leave), end=" ")
        if total_underflow:
            print("UNDERFLOW: %d" % total_underflow, end=" ")
        if total_overflow:
            print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
Beispiel #9
0
def align_get_indices_nd(ctx_length,
                         groups,
                         need,
                         filter_value,
                         score,
                         take_filter=None,
                         leave_filter=None,
                         method="bysorting"):
    """Select which individuals get aligned in each group.

    For each group (zipped with ``need.flat``), individuals matching
    *take_filter* are always selected first; the remaining slots are filled
    either with the highest-*score* candidates (``method='bysorting'``) or
    by sampling candidates proportionally to their score
    (``method='sidewalk'``, which requires scores in [0, 1]).  Candidates
    matching *leave_filter* are excluded.

    Returns a boolean array of length *ctx_length*, True for the selected
    ("aligned") individuals.
    """
    assert isinstance(need, np.ndarray) and \
        np.issubdtype(need.dtype, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    # NOTE(review): when filter_value is not None, maybe_filter aliases
    # bool_filter_value, so the in-place "&= ~take_filter" below also
    # mutates bool_filter_value before it is re-read to compute `leave`
    # -- confirm that excluding taken individuals from the leave count is
    # intended.
    maybe_filter = bool_filter_value
    if take_filter is not None:
        take_intersect = take_filter & bool_filter_value
        take = np.sum(take_intersect)

        # XXX: it would probably be faster to leave the filters as boolean
        # vector and do
        #     take_members = take_filter[member_indices]
        #     group_always = member_indices[take_members]
        # instead of
        #     group_always = np.intersect1d(members_indices, take_indices,
        #                                   assume_unique=True)
        take_indices = filter_to_indices(take_intersect)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter & bool_filter_value)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    # indices of individuals which *may* be selected (filtered, not taken,
    # not left); None means "no take/leave restriction"
    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)

    if method == 'sidewalk':
        # sidewalk treats scores as probabilities, so they must lie in
        # [0, 1] (assumes score is an ndarray here -- TODO confirm)
        score_max = max(score)
        score_min = min(score)
        if score_max > 1 or score_min < 0:
            raise Exception("""Score values are in the interval {} - {}.
Sidewalk alignment can only be used with a score between 0 and 1.
You may want to use a logistic function.
""".format(score_min, score_max))

    for members_indices, group_need in izip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected

            if take_indices is not None:
                # members of this group which are always taken
                group_always = np.intersect1d(members_indices,
                                              take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    if method == 'bysorting':
                        maybe_members_rank_value = score[group_maybe_indices]
                        # TODO: use np.partition (np1.8+)
                        sorted_local_indices = np.argsort(
                            maybe_members_rank_value)
                        sorted_global_indices = \
                            group_maybe_indices[sorted_local_indices]
                    elif method == 'sidewalk':
                        # sidewalk draws from a random ordering instead of
                        # sorting by score
                        sorted_global_indices = \
                            np.random.permutation(group_maybe_indices)
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                if method == 'bysorting':
                    # take the last X individuals (ie those with the highest
                    # score)
                    indices_to_take = sorted_global_indices[-maybe_to_take:]
                elif method == 'sidewalk':
                    proba_sum = sum(score[sorted_global_indices])
                    if maybe_to_take > round(proba_sum):
                        raise ValueError(
                            "Cannot use 'sidewalk' with need = {} > sum of probabilities = {}"
                            .format(maybe_to_take, proba_sum))
                    # systematic sampling: one random offset, then equally
                    # spaced points over the cumulated scores
                    u = np.random.uniform() + np.arange(maybe_to_take)
                    # on the random sample, score are cumulated and then, we
                    # extract indices of each value before each value of u
                    cum_score = np.cumsum(score[sorted_global_indices])
                    indices_to_take = \
                        sorted_global_indices[np.searchsorted(cum_score, u)]

                # underflow: not enough candidates to reach the group's need
                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                # overflow: more individuals were always-taken than needed
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    if config.log_level == "processes":
        print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
        if (take_filter is not None) or (leave_filter is not None):
            print("[take %d, leave %d]" % (take, leave), end=" ")
        if total_underflow:
            print("UNDERFLOW: %d" % total_underflow, end=" ")
        if total_overflow:
            print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
Beispiel #10
0
def align_get_indices_nd(ctx_length, groups, need, filter_value, score,
                         take_filter=None, leave_filter=None,
                         method="bysorting"):
    """Select which individuals get aligned in each group.

    For each group (zipped with ``need.flat``), individuals matching
    *take_filter* are always selected first; the remaining slots are filled
    either with the highest-*score* candidates (``method='bysorting'``) or
    by sampling candidates proportionally to their score
    (``method='sidewalk'``, which requires scores in [0, 1]).  Candidates
    matching *leave_filter* are excluded.

    Returns a boolean array of length *ctx_length*, True for the selected
    ("aligned") individuals.
    """
    assert isinstance(need, np.ndarray) and \
        np.issubdtype(need.dtype, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    # NOTE(review): when filter_value is not None, maybe_filter aliases
    # bool_filter_value, so the in-place "&= ~take_filter" below also
    # mutates bool_filter_value before it is re-read to compute `leave`
    # -- confirm that excluding taken individuals from the leave count is
    # intended.
    maybe_filter = bool_filter_value
    if take_filter is not None:
        take_intersect = take_filter & bool_filter_value
        take = np.sum(take_intersect)

        # XXX: it would probably be faster to leave the filters as boolean
        # vector and do
        #     take_members = take_filter[member_indices]
        #     group_always = member_indices[take_members]
        # instead of
        #     group_always = np.intersect1d(members_indices, take_indices,
        #                                   assume_unique=True)
        take_indices = filter_to_indices(take_intersect)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter & bool_filter_value)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    # indices of individuals which *may* be selected (filtered, not taken,
    # not left); None means "no take/leave restriction"
    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)

    if method == 'sidewalk':
        # sidewalk treats scores as probabilities, so they must lie in
        # [0, 1] (assumes score is an ndarray here -- TODO confirm)
        score_max = max(score)
        score_min = min(score)
        if score_max > 1 or score_min < 0:
            raise Exception("""Score values are in the interval {} - {}.
Sidewalk alignment can only be used with a score between 0 and 1.
You may want to use a logistic function.
""".format(score_min, score_max))

    for members_indices, group_need in izip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected

            if take_indices is not None:
                # members of this group which are always taken
                group_always = np.intersect1d(members_indices, take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    if method == 'bysorting':
                        maybe_members_rank_value = score[group_maybe_indices]
                        # TODO: use np.partition (np1.8+)
                        sorted_local_indices = np.argsort(maybe_members_rank_value)
                        sorted_global_indices = \
                            group_maybe_indices[sorted_local_indices]
                    elif method == 'sidewalk':
                        # sidewalk draws from a random ordering instead of
                        # sorting by score
                        sorted_global_indices = \
                            np.random.permutation(group_maybe_indices)
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                if method == 'bysorting':
                    # take the last X individuals (ie those with the highest
                    # score)
                    indices_to_take = sorted_global_indices[-maybe_to_take:]
                elif method == 'sidewalk':
                    proba_sum = sum(score[sorted_global_indices])
                    if maybe_to_take > round(proba_sum):
                        raise ValueError(
                            "Cannot use 'sidewalk' with need = {} > sum of probabilities = {}".format(
                                maybe_to_take, proba_sum
                                )
                            )
                    # systematic sampling: one random offset, then equally
                    # spaced points over the cumulated scores
                    u = np.random.uniform() + np.arange(maybe_to_take)
                    # on the random sample, score are cumulated and then, we
                    # extract indices of each value before each value of u
                    cum_score = np.cumsum(score[sorted_global_indices])
                    indices_to_take = \
                        sorted_global_indices[np.searchsorted(cum_score, u)]

                # underflow: not enough candidates to reach the group's need
                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                # overflow: more individuals were always-taken than needed
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    if config.log_level == "processes":
        print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
        if (take_filter is not None) or (leave_filter is not None):
            print("[take %d, leave %d]" % (take, leave), end=" ")
        if total_underflow:
            print("UNDERFLOW: %d" % total_underflow, end=" ")
        if total_overflow:
            print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
Beispiel #11
0
def align_get_indices_nd(ctx_length, groups, need, filter_value, score,
                         take_filter=None, leave_filter=None, method="default"):
    """Select which individuals to "align" so each group reaches its target.

    Returns a boolean array of length ctx_length with True for every selected
    individual.

    * need gives, per alignment cell (iterated flat, in the same order as
      groups), the number of individuals to select in that cell.
    * take_filter / leave_filter mark individuals which must always / never
      be selected.
    * method="default" ranks candidates by score and takes the highest ones;
      method="sidewalk" treats scores as probabilities in [0, 1] and selects
      by systematic sampling proportional to score.
    """
    assert isinstance(need, np.ndarray) and \
           issubclass(need.dtype.type, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))
    # XXX: delete this check, as it is already done earlier by the caller?
    assert method in ("default", "sidewalk")

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    maybe_filter = bool_filter_value
    if take_filter is not None:
        #XXX: I wonder if users would prefer if filter_value was taken into
        # account or not. This only impacts what is displayed on the console,
        # but still...
        take = np.sum(take_filter)

        #XXX: it would probably be faster to leave the filters as boolean
        # vectors and do
        #     take_members = take_filter[member_indices]
        #     group_always = member_indices[take_members]
        # instead of
        #     group_always = np.intersect1d(members_indices, take_indices,
        #                                   assume_unique=True)
        take_indices = filter_to_indices(take_filter & bool_filter_value)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)
    # note: izip was replaced by the builtin zip (equivalent on both
    # Python 2 and 3)
    for members_indices, group_need in zip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected
            if take_indices is not None:
                group_always = np.intersect1d(members_indices, take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    if method == 'default':
                        maybe_members_rank_value = score[group_maybe_indices]
                        sorted_local_indices = \
                            np.argsort(maybe_members_rank_value)
                        sorted_global_indices = \
                            group_maybe_indices[sorted_local_indices]
                    elif method == 'sidewalk':
                        group_scores = score[group_maybe_indices]
                        if group_scores.max() > 1 or group_scores.min() < 0:
                            raise Exception("Sidewalk method can be used only "
                                            "with a score between 0 and 1. "
                                            "You may want to use a logistic "
                                            "function ")
                        # visit candidates in a random order so that the
                        # systematic sampling below is unbiased
                        sorted_local_indices = \
                            np.random.permutation(len(group_maybe_indices))
                        sorted_global_indices = \
                            group_maybe_indices[sorted_local_indices]
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                if method == 'default':
                    # take the last X individuals (ie those with the highest
                    # score)
                    indices_to_take = sorted_global_indices[-maybe_to_take:]
                elif method == 'sidewalk':
                    u = np.random.uniform() + np.arange(maybe_to_take)
                    # scores of the randomly ordered sample are cumulated,
                    # then we take, for each value of u, the first candidate
                    # whose cumulated score reaches it (systematic sampling
                    # proportional to score).
                    # BUG FIX: the cumulated scores must be those of the
                    # *global* sorted indices; the original indexed the global
                    # score array with group-local indices.
                    cum_score = np.cumsum(score[sorted_global_indices])
                    indices_to_take = \
                        sorted_global_indices[np.searchsorted(cum_score, u)]
                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
    if (take_filter is not None) or (leave_filter is not None):
        print("[take %d, leave %d]" % (take, leave), end=" ")
    if total_underflow:
        print("UNDERFLOW: %d" % total_underflow, end=" ")
    if total_overflow:
        print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
Beispiel #12
0
    def compute(self, context, score, need, filter=None, take=None, leave=None,
                expressions=None, possible_values=None, errors='default',
                frac_need='uniform', link=None, secondary_axis=None):
        """Evaluate this alignment on the current context and return the
        boolean array of aligned individuals (see align_get_indices_nd).

        frac_need controls how fractional needs are handled; errors controls
        carrying over past rounding errors.
        """
        # NOTE: `need`, `expressions` and `possible_values` are recomputed
        # unconditionally by self._eval_need below, so the normalisation the
        # original code performed on these arguments here was dead code and
        # has been removed.
        if frac_need not in ('uniform', 'cutoff', 'round'):
            cls = ValueError if isinstance(frac_need, basestring) else TypeError
            raise cls("frac_need should be one of: 'uniform', 'cutoff' or "
                      "'round'")

        # BUG FIX: ctx_length was used below but never defined (NameError at
        # runtime); align_no_link computes it the same way.
        ctx_length = context_length(context)

        scores = expr_eval(self.expr, context)
        filter_value = expr_eval(self._getfilter(context), context)

        need, expressions, possible_values = \
            self._eval_need(context, scores, filter_value)

        take_filter = expr_eval(self.take_filter, context)
        leave_filter = expr_eval(self.leave_filter, context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        # retrieve the columns we need to work with
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        periodicity = context['periodicity']
        if context['format_date'] == 'year0':
            # gives the right periodicity / self.periodicity_given ratio,
            # whereas self.periodicity_given / 12 would not
            periodicity = periodicity * 12

        # force sign(self.periodicity_given) == sign(periodicity)
        self.periodicity_given = \
            self.periodicity_given * (self.periodicity_given * periodicity) \
            / abs(self.periodicity_given * periodicity)
        if gcd(periodicity, self.periodicity_given) not in \
                (periodicity, self.periodicity_given):
            # BUG FIX: the original `raise("...")` raised a plain string,
            # which is itself a TypeError in Python 3; raise a real exception
            raise ValueError("mix of quarter and triannual impossible")

        # rescale needs (and scores) to the requested periodicity
        need = need * periodicity / self.periodicity_given
        if scores is not None:
            scores = scores * periodicity / self.periodicity_given

        #noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, method=frac_need)
        need = self._add_past_error(context, need, method=errors)

        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    scores, take_filter, leave_filter,
                                    method=self.method)
Beispiel #13
0
    def align_no_link(self, context):
        """Align the individuals of the current entity (the variant of
        alignment that does not involve a link) and return the boolean array
        of aligned individuals (see align_get_indices_nd).
        """
        ctx_length = context_length(context)

        scores = expr_eval(self.expr, context)
        filter_value = expr_eval(self._getfilter(context), context)

        need, expressions, possible_values = \
            self._eval_need(context, scores, filter_value)

        take_filter = expr_eval(self.take_filter, context)
        leave_filter = expr_eval(self.leave_filter, context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        if expressions:
            # retrieve the columns we need to work with
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            # BUG FIX: columns was left unbound in this branch, causing a
            # NameError in the "unaligned" display below when no expressions
            # are given
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        periodicity = context['periodicity']
        if context['format_date'] == 'year0':
            # gives the right periodicity / self.periodicity_given ratio,
            # whereas self.periodicity_given / 12 would not
            periodicity = periodicity * 12

        # force sign(self.periodicity_given) == sign(periodicity)
        self.periodicity_given = \
            self.periodicity_given * (self.periodicity_given * periodicity) \
            / abs(self.periodicity_given * periodicity)
        if gcd(periodicity, self.periodicity_given) not in \
                (periodicity, self.periodicity_given):
            # BUG FIX: the original `raise("...")` raised a plain string,
            # which is itself a TypeError in Python 3; raise a real exception
            raise ValueError("mix of quarter and triannual impossible")

        # rescale needs (and scores) to the requested periodicity
        need = need * periodicity / self.periodicity_given
        if scores is not None:
            scores = scores * periodicity / self.periodicity_given

        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need)
        need = self._add_past_error(need, context)

        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    scores, take_filter, leave_filter,
                                    method=self.method)