Example #1
            def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
                global local_ctx
                
                set2_size = context_length(local_ctx)
                if not set2_size:
                    raise StopIteration
                
                if set2_size > pool_size:
                    pool = random.sample(xrange(set2_size), pool_size)
                else:
                    pool = range(set2_size)

                sub_local_ctx = context_subset(local_ctx, pool, None)
                sub_local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
                
                set2_scores = expr_eval(score_expr, sub_local_ctx)
    
                individual2_pool_idx = np.argmax(set2_scores)
                individual2_idx = pool[individual2_pool_idx]
                
                id1 = sub_local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]
    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
Example #2
    def evaluate(self, context):
        ctx_filter = context.get('__filter__')
        id_to_rownum = context.id_to_rownum

        # at some point ctx_filter will be cached automatically, so we don't
        # need to take care of it manually here
        if ctx_filter is not None:
            set1filter = expr_eval(ctx_filter & self.set1filter, context)
            set2filter = expr_eval(ctx_filter & self.set2filter, context)
        else:
            set1filter = expr_eval(self.set1filter, context)
            set2filter = expr_eval(self.set2filter, context)

        rank1_expr = self.rank1_expr
        rank2_expr = self.rank2_expr
        used_variables1 = rank1_expr.collect_variables(context)
        used_variables2 = rank2_expr.collect_variables(context)
        used_variables1.add('id')
        used_variables2.add('id')
        set1 = context_subset(context, set1filter, used_variables1)
        set2 = context_subset(context, set2filter, used_variables2)
        set1len = set1filter.sum()
        set2len = set2filter.sum()
        tomatch = min(set1len, set2len)
        order1 = expr_eval(rank1_expr, context)
        order2 = expr_eval(rank2_expr, context)
        if not self.ascending1:
            order1 = -order1        # reverse sorting
        if not self.ascending2:
            order2 = -order2        # reverse sorting

        sorted_set1_indices = order1[set1filter].argsort()
        sorted_set2_indices = order2[set2filter].argsort()
        idx1 = sorted_set1_indices[:tomatch]
        idx2 = sorted_set2_indices[:tomatch]
        print("matching with %d/%d individuals" % (set1len, set2len))
        
        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)
        
        id1 = set1['id'][idx1]
        id2 = set2['id'][idx2]
        result[id_to_rownum[id1]] = id2
        result[id_to_rownum[id2]] = id1

        return result
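
Example #2 pairs the two sets positionally: each set is sorted by its own rank expression and the i-th best of set 1 is matched with the i-th best of set 2. A minimal sketch of that idea on plain NumPy arrays (the data and names are made up for illustration):

    import numpy as np

    # toy data: two sets ranked independently, then paired by position
    ids1 = np.array([10, 11, 12, 13])
    rank1 = np.array([3.0, 1.0, 2.0, 4.0])
    ids2 = np.array([20, 21, 22])
    rank2 = np.array([0.5, 2.5, 1.5])

    tomatch = min(len(ids1), len(ids2))
    idx1 = rank1.argsort()[:tomatch]     # best-ranked individuals of set 1
    idx2 = rank2.argsort()[:tomatch]     # best-ranked individuals of set 2

    # the i-th ranked individual of set 1 is matched with the i-th of set 2
    pairs = list(zip(ids1[idx1], ids2[idx2]))
    print(pairs)                         # [(11, 20), (12, 22), (10, 21)]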
Example #3
    def evaluate(self, context):
        expr = self.expr
        expr_vars = collect_variables(expr, context)

        expressions = self.expressions
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        if self.filter is not None:
            filter_value = expr_eval(self.filter, context)
            #TODO: make a function out of this, I think we have this pattern
            # in several places
            filtered_columns = [col[filter_value]
                                   if isinstance(col, np.ndarray) and col.shape
                                   else [col]
                                for col in columns]
            filtered_context = context_subset(context, filter_value, expr_vars)
        else:
            filtered_columns = columns
            filtered_context = context

        possible_values = self.pvalues
        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to partition_nd
        # because it is a bit faster this way. The indices are still correct,
        # because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            return LabeledArray([], labels, possible_values)

        # evaluate the expression on each group
        data = [expr_eval(expr, context_subset(filtered_context, indices,
                                               expr_vars))
                for indices in groups]

        #TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
#        if self.filter is not None:
#            filter_value = expr_eval(self.filter, context)
#        else:
#            filter_value = True
#
#        d = group_indices_nd(columns, filter_value)
#        pvalues = sorted(d.keys())
#        ndim = len(columns)
#        possible_values = [[pv[i] for pv in pvalues]
#                           for i in range(ndim)]
#        groups = [d[k] for k in pvalues]

        # groups is a (flat) list of lists.
        # the first variable is the outer-most "loop",
        # the last one the inner-most.

        # add total for each row
        len_pvalues = [len(vals) for vals in possible_values]
        width = len_pvalues[-1]
        height = prod(len_pvalues[:-1])

        rows_indices = [np.concatenate([groups[y * width + x]
                                        for x in range(width)])
                        for y in range(height)]
        cols_indices = [np.concatenate([groups[y * width + x]
                                        for y in range(height)])
                        for x in range(width)]
        cols_indices.append(np.concatenate(cols_indices))

        # evaluate the expression on each "combined" group (ie compute totals)
        row_totals = [expr_eval(expr, context_subset(filtered_context, inds,
                                                     expr_vars))
                      for inds in rows_indices]
        col_totals = [expr_eval(expr, context_subset(filtered_context, inds,
                                                     expr_vars))
                      for inds in cols_indices]

        if self.percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behaviour of raising an exception.
            # This can happen at least when using the default expr (count())
            # and the filter yields empty groups
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value for value in row_totals]
            col_totals = [100.0 * value / total_value for value in col_totals]

#        if self.by or self.percent:
#            if self.percent:
#                total_value = data[-1]
#                divisors = [total_value for _ in data]
#            else:
#                num_by = len(self.by)
#                inc = prod(len_pvalues[-num_by:])
#                num_groups = len(groups)
#                num_categories = prod(len_pvalues[:-num_by])
#
#                categories_groups_idx = [range(cat_idx, num_groups, inc)
#                                         for cat_idx in range(num_categories)]
#
#                divisors = ...
#
#            data = [100.0 * value / divisor
#                    for value, divisor in izip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data),
        # because if data is a list of ndarrays (for example if we use
        # groupby(a, expr=id)) *and* all the ndarrays have the same length,
        # the result is a 2d array instead of an array of ndarrays like we
        # need (at this point).
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr

        # and reshape it
        data = data.reshape(len_pvalues)
        return LabeledArray(data, labels, possible_values,
                            row_totals, col_totals)
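
The row and column totals in Example #3 are obtained by concatenating the flat `groups` list along each axis, with the last expression varying fastest. A self-contained sketch of that layout with made-up index arrays for a 2 x 3 cross-table:

    import numpy as np

    # per-cell index arrays for a 2 x 3 table, last dimension varying fastest
    groups = [np.array(g) for g in ([0], [1, 2], [3],
                                    [4], [5], [6, 7])]
    height, width = 2, 3

    rows_indices = [np.concatenate([groups[y * width + x] for x in range(width)])
                    for y in range(height)]
    cols_indices = [np.concatenate([groups[y * width + x] for y in range(height)])
                    for x in range(width)]
    cols_indices.append(np.concatenate(cols_indices))     # grand total

    print([r.tolist() for r in rows_indices])
    # [[0, 1, 2, 3], [4, 5, 6, 7]]
    print([c.tolist() for c in cols_indices])
    # [[0, 4], [1, 2, 5], [3, 6, 7], [0, 4, 1, 2, 5, 3, 6, 7]]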
Example #4
    def evaluate(self, context):
        source_entity = context['__entity__']
        if self.entity_name is None:
            target_entity = source_entity
        else:
            target_entity = entity_registry[self.entity_name]

        if target_entity is source_entity:
            target_context = context
        else:
            target_context = EntityContext(target_entity,
                                           {'period': context['period']})

        ctx_filter = context.get('__filter__')

        if self.filter is not None and ctx_filter is not None:
            filter_expr = ctx_filter & self.filter
        elif self.filter is not None:
            filter_expr = self.filter
        elif ctx_filter is not None:
            filter_expr = ctx_filter
        else:
            filter_expr = None

        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif self.number is not None:
            to_give_birth = None
            num_birth = self.number
        else:
            raise Exception('no filter nor number in "new"')

        array = target_entity.array

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth)
        # select real duplication case
        if self.num_duplicate is not None:
            number_rep = array[self.num_duplicate].compress(
                array[self.num_duplicate] > 0)
            children = children.repeat(number_rep, axis=0)
            num_birth = number_rep.sum()
            
        if self.expand:
            from numpy.lib.stride_tricks import as_strided

            id_add = np.arange(number_rep.max())
            id_add = as_strided(id_add,
                                shape=number_rep.shape + id_add.shape,
                                strides=(0,) + id_add.strides)
            id_add = id_add[id_add < number_rep[:, None]]
            one_by_house = array['res'].compress(array[self.num_duplicate] > 0)
#            indices = np.unique(one_by_house)
#            size_by_id = np.bincount(one_by_house)
#            size_by_id = size_by_id.compress(size_by_id > 0)
#            size_by_id = size_by_id.repeat(size_by_id)
            id_ini = one_by_house.repeat(number_rep, axis=0)
            decalage = np.zeros(len(one_by_house), dtype=int)
            indices = np.unique(one_by_house, return_index=True)[1]
            decalage[indices[1:]] = number_rep[indices]
            decalage = decalage.cumsum().repeat(number_rep, axis=0)
#            decalage = decalage - decalage[0]
            children['res'] = id_add + decalage + array['res'].max() + 1
            
        remember_id = children['id'].copy()
        
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context['period']

            used_variables = self._collect_kwargs_variables(context)
            if to_give_birth is None:
                child_context = new_context_like(context, length=num_birth)
            else:
                child_context = context_subset(context, to_give_birth,
                                               used_variables)
            for k, v in self.kwargs.iteritems():
                children[k] = expr_eval(v, child_context)
                       
        if self.numerotation is not None:
            from numpy.lib.stride_tricks import as_strided
            initial = np.zeros(len(array), dtype=bool)
            id_dup = np.arange(number_rep.max())
            id_dup = as_strided(id_dup,
                                shape=number_rep.shape + id_dup.shape,
                                strides=(0,) + id_dup.strides)
            id_dup = id_dup[id_dup < number_rep[:, None]] + 1
            children[self.numerotation] = id_dup

        add_individuals(target_context, children)

        # result is the ids of the new individuals corresponding to the source
        # entity
        # changed here to optionally return the "father" ids instead
        if to_give_birth is not None:
            if self.return_option is None:
                result = np.empty(context_length(context), dtype=int)
                result.fill(-1)
                # TODO: must change something to have father size correct with
                # target and not with source.
                if source_entity is target_entity:               
                    extra_bools = np.zeros(num_birth, dtype=bool)
                    to_give_birth = np.concatenate((to_give_birth, extra_bools))
                    
                # Note that np.place is a tad faster, but is currently buggy when
                # working with columns of structured arrays.
                # See http://projects.scipy.org/numpy/ticket/1869
                result[to_give_birth] = children['id']

                return result
            elif self.return_option == 'father':
                father = np.empty(context_length(context), dtype=int)
                father.fill(-1)
                list_children = np.ones(num_birth, dtype=bool)
                initial = np.zeros(len(array), dtype=bool)
                birth = np.concatenate((initial, list_children))
                father[birth] = remember_id
                return father
        else:
            return None
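
Example #4 uses an `as_strided` view twice to build, for each source row, the counters 0..number_rep[i]-1 without allocating anything per copy. A minimal sketch of that expansion trick in isolation (the `number_rep` values are made up):

    import numpy as np
    from numpy.lib.stride_tricks import as_strided

    number_rep = np.array([2, 0, 3, 1])    # how many copies of each source row

    id_add = np.arange(number_rep.max())   # [0, 1, 2]
    # broadcast the same counter row over every source row without copying
    id_add = as_strided(id_add,
                        shape=number_rep.shape + id_add.shape,
                        strides=(0,) + id_add.strides)
    # keep, for each source row, only the first number_rep[i] counters
    id_add = id_add[id_add < number_rep[:, None]]
    print(id_add)                          # [0 1 0 1 2 0]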
Example #5
    def evaluate(self, context):
        global local_ctx

        ctx_filter = context.get('__filter__')

        id_to_rownum = context.id_to_rownum

        # at some point ctx_filter will be cached automatically, so we don't
        # need to take care of it manually here
        if ctx_filter is not None:
            set1filter = expr_eval(ctx_filter & self.set1filter, context)
            set2filter = expr_eval(ctx_filter & self.set2filter, context)
        else:
            set1filter = expr_eval(self.set1filter, context)
            set2filter = expr_eval(self.set2filter, context)

        score_expr = self.score_expr

        used_variables = score_expr.collect_variables(context)
        used_variables1 = [v for v in used_variables
                                    if not v.startswith('__other_')]
        used_variables2 = [v[8:] for v in used_variables
                                    if v.startswith('__other_')]

        set1 = context_subset(context, set1filter, ['id'] + used_variables1)
        set2 = context_subset(context, set2filter, ['id'] + used_variables2)
        set1len = set1filter.sum()
        set2len = set2filter.sum()
        tomatch = min(set1len, set2len)
        
        orderby = self.orderby
        if not isinstance(orderby, str):
            order = expr_eval(orderby, context)
        else:
            # use float, otherwise the standardized distances below would be
            # silently truncated to integers
            order = np.zeros(context_length(context), dtype=float)
            if orderby == 'EDtM':
                # squared Euclidean distance to the mean of set 1,
                # standardized by the variance of each variable
                for var in used_variables1:
                    order[set1filter] += \
                        (set1[var] - set1[var].mean()) ** 2 / set1[var].var()
            if orderby == 'SDtOM':
                # score against the mean of set 2 for the "other" variables
                order_ctx = dict(set1.iteritems())
                order_ctx.update(('__other_' + k, set2[k].mean())
                                 for k in used_variables2)
                order[set1filter] = expr_eval(score_expr, order_ctx)
        
        sorted_set1_indices = order[set1filter].argsort()[::-1]
        set1tomatch = sorted_set1_indices[:tomatch]
        print("matching with %d/%d individuals" % (set1len, set2len))

        #TODO: compute pk_names automatically: variables which are either
        # boolean, or have very few possible values and which are used more
        # than once in the expression and/or which are used in boolean
        # expressions
#        pk_names = ('eduach', 'work')
#        optimized_exprs = {}

        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)

        local_ctx = dict(('__other_' + k if k in ['id'] + used_variables2 else k, v)
                         for k, v in set2.iteritems())

        if self.pool_size is None:
            #noinspection PyUnusedLocal
            def match_one_set1_individual(idx, sorted_idx):
                global local_ctx
    
                if not context_length(local_ctx):
                    raise StopIteration
    
                local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
    
    #            pk = tuple(individual1[fname] for fname in pk_names)
    #            optimized_expr = optimized_exprs.get(pk)
    #            if optimized_expr is None:
    #                for name in pk_names:
    #                    fake_set1['__f_%s' % name].value = individual1[name]
    #                optimized_expr = str(symbolic_expr.simplify())
    #                optimized_exprs[pk] = optimized_expr
    #            set2_scores = evaluate(optimized_expr, mm_dict, set2)
    
                set2_scores = expr_eval(score_expr, local_ctx)
    
                individual2_idx = np.argmax(set2_scores)
    
                id1 = local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]
    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1            
            
            loop_wh_progress(match_one_set1_individual, set1tomatch)
        else:
            pool_size = self.pool_size
            #noinspection PyUnusedLocal
            def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
                global local_ctx
                
                set2_size = context_length(local_ctx)
                if not set2_size:
                    raise StopIteration
                
                if set2_size > pool_size:
                    pool = random.sample(xrange(set2_size), pool_size)
                else:
                    pool = range(set2_size)

                sub_local_ctx = context_subset(local_ctx, pool, None)
                sub_local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
                
                set2_scores = expr_eval(score_expr, sub_local_ctx)
    
                individual2_pool_idx = np.argmax(set2_scores)
                individual2_idx = pool[individual2_pool_idx]
                
                id1 = sub_local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]
    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
                
            loop_wh_progress(match_one_set1_individual_pool, set1tomatch,
                             pool_size=pool_size)
            
        return result
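
When `pool_size` is given in Example #5, each set 1 individual only scores a random sample of the remaining set 2, and the winning pool-local index has to be mapped back to an index in the full remaining set. A rough sketch of that bookkeeping with toy arrays (the scores are made up):

    import random
    import numpy as np

    remaining_ids2 = np.array([20, 21, 22, 23, 24])   # unmatched set 2 ids
    scores = np.array([0.1, 0.9, 0.3, 0.8, 0.2])      # pretend score per candidate
    pool_size = 3

    if len(remaining_ids2) > pool_size:
        pool = random.sample(range(len(remaining_ids2)), pool_size)
    else:
        pool = list(range(len(remaining_ids2)))

    best_in_pool = scores[pool].argmax()   # index within the pool
    best_idx = pool[best_in_pool]          # index within the whole remaining set
    print("matched id %d" % remaining_ids2[best_idx])

    # the matched candidate is then removed, as context_delete does above
    remaining_ids2 = np.delete(remaining_ids2, best_idx)
    scores = np.delete(scores, best_idx)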
Example #6
    def evaluate(self, context):
        global local_ctx
        global cost

        ctx_filter = context.get('__filter__')

        id_to_rownum = context.id_to_rownum

        # at some point ctx_filter will be cached automatically, so we don't
        # need to take care of it manually here
        if ctx_filter is not None:
            set1filter = expr_eval(ctx_filter & self.set1filter, context)
            set2filter = expr_eval(ctx_filter & self.set2filter, context)
        else:
            set1filter = expr_eval(self.set1filter, context)
            set2filter = expr_eval(self.set2filter, context)

        score_expr = self.score_expr

        used_variables = score_expr.collect_variables(context)
        used_variables1 = ['id'] + [v for v in used_variables
                                    if not v.startswith('__other_')]
        used_variables2 = ['id'] + [v[8:] for v in used_variables
                                    if v.startswith('__other_')]

        set1 = context_subset(context, set1filter, used_variables1)
        set2 = context_subset(context, set2filter, used_variables2)
        orderby = expr_eval(self.orderby, context)
        sorted_set1_indices = orderby[set1filter].argsort()[::-1]
        print "matching with %d/%d individuals" % (set1filter.sum(),
                                                   set2filter.sum())

        #TODO: compute pk_names automatically: variables which are either
        # boolean, or have very few possible values and which are used more
        # than once in the expression and/or which are used in boolean
        # expressions
#        pk_names = ('eduach', 'work')
#        optimized_exprs = {}

        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)

        local_ctx = dict(('__other_' + k if k in used_variables2 else k, v)
                         for k, v in set2.iteritems())
#        print local_ctx
#        test=local_ctx.copy()
#        test.update((k, set1[k]) for k in used_variables1)
#
#

        
######## Munkres (optimal assignment) attempt
        
        if self.option == "optimal": 
            cost = []
            def create_cost(idx, sorted_idx):
    
                global cost
                if not context_length(local_ctx):
                    raise StopIteration
                local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
    
                set2_scores = expr_eval(score_expr, local_ctx)
                cost.append(set2_scores[:].tolist())
                
            loop_wh_progress(create_cost, sorted_set1_indices)       
            resultat = MunkresX.maxWeightMatching(cost)
            for id1, id2 in resultat.items():
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1    
            return result
        
        else:
            def match_one_set1_individual(idx, sorted_idx):
                global local_ctx   
                if not context_length(local_ctx):
                    raise StopIteration    
                local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
                set2_scores = expr_eval(score_expr, local_ctx)
    #            print set2_scores
                individual2_idx = np.argmax(set2_scores)   
                id1 = local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
    
            loop_wh_progress(match_one_set1_individual, sorted_set1_indices)       
            return result
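
The "optimal" branch of Example #6 builds the full score matrix row by row and delegates the assignment to `MunkresX.maxWeightMatching`. As an aside, the same maximum-weight assignment could be sketched with SciPy's `linear_sum_assignment` (which minimizes, hence the negation); this is only an illustration, not what the example itself uses:

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    # cost[i, j] = score of matching individual i of set 1 with j of set 2
    cost = np.array([[3.0, 1.0, 2.0],
                     [2.0, 4.0, 1.0]])

    # linear_sum_assignment minimizes, so negate to maximize the total score
    rows, cols = linear_sum_assignment(-cost)
    print(list(zip(rows.tolist(), cols.tolist())))   # [(0, 0), (1, 1)]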
Example #7
        def match_cell(idx, sorted_idx, pool_size):
            global matching_ctx

            set2_size = context_length(matching_ctx)
            if not set2_size:
                raise StopIteration

            if pool_size is not None and set2_size > pool_size:
                pool = random.sample(xrange(set2_size), pool_size)
                local_ctx = context_subset(matching_ctx, pool)
            else:
                local_ctx = matching_ctx.copy()

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in {'__ids__'} | used_variables1)

            eval_ctx = context.clone(entity_data=local_ctx)
            set2_scores = expr_eval(score, eval_ctx)
            cell2_idx = set2_scores.argmax()

            cell1ids = local_ctx['__ids__']
            cell2ids = local_ctx['__other___ids__'][cell2_idx]

            if pool_size is not None and set2_size > pool_size:
                # transform pool-local index to set/matching_ctx index
                cell2_idx = pool[cell2_idx]

            cell1size = len(cell1ids)
            cell2size = len(cell2ids)
            nb_match = min(cell1size, cell2size)

            # we could introduce a random choice here but it is not really
            # necessary. In that case, it should be done in group_context
            ids1 = cell1ids[:nb_match]
            ids2 = cell2ids[:nb_match]

            result[id_to_rownum[ids1]] = ids2
            result[id_to_rownum[ids2]] = ids1
            
            if nb_match == cell2size:
                matching_ctx = context_delete(matching_ctx, cell2_idx)
            else:
                # other variables do not need to be modified since the cell
                # only got smaller and was not deleted
                matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

            # FIXME: the expr gets cached for the full matching_ctx at the
            # beginning and then, when another woman with the same values is
            # found, it thinks it can reuse the expr but it breaks because it
            # does not have the correct length.

            # the current workaround is to invalidate the whole cache for the
            # current entity but this is not the right way to go.
            # * disable the cache for matching?
            # * use a local cache so that methods after matching() can use
            # what was in the cache before matching(). Shouldn't the cache be
            # stored inside the context anyway?
            expr_cache.invalidate(context.period, context.entity_name)

            if nb_match < cell1size:
                set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
                match_cell(idx, sorted_idx, pool_size)
Example #8
    def evaluate(self, context):
        global matching_ctx

        ctx_filter = context.get('__filter__')

        id_to_rownum = context.id_to_rownum

        # at some point ctx_filter will be cached automatically, so we don't
        # need to take care of it manually here
        if ctx_filter is not None:
            set1filter = expr_eval(ctx_filter & self.set1filter, context)
            set2filter = expr_eval(ctx_filter & self.set2filter, context)
        else:
            set1filter = expr_eval(self.set1filter, context)
            set2filter = expr_eval(self.set2filter, context)

        score_expr = self.score_expr

        used_variables = score_expr.collect_variables(context)
        used_variables1 = ['id'] + [v for v in used_variables
                                    if not v.startswith('__other_')]
        used_variables2 = ['id'] + [v[8:] for v in used_variables
                                    if v.startswith('__other_')]

        #TODO: we should detect whether or not we are using non-simple
        # expressions (EvaluableExpression children) and pre-evaluate them,
        # because otherwise they are re-evaluated on all of set2 for each
        # individual in set1. See https://github.com/liam2/liam2/issues/128
        set1 = context_subset(context, set1filter, used_variables1)
        set2 = context_subset(context, set2filter, used_variables2)
        orderby = expr_eval(self.orderby, context)
        set1len = set1filter.sum()
        set2len = set2filter.sum()
        tomatch = min(set1len, set2len)
        sorted_set1_indices = orderby[set1filter].argsort()[::-1]
        set1tomatch = sorted_set1_indices[:tomatch]
        print("matching with %d/%d individuals" % (set1len, set2len))

        #TODO: compute pk_names automatically: variables which are either
        # boolean, or have very few possible values and which are used more
        # than once in the expression and/or which are used in boolean
        # expressions
#        pk_names = ('eduach', 'work')
#        optimized_exprs = {}

        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)

        matching_ctx = dict(('__other_' + k if k in used_variables2 else k, v)
                            for k, v in set2.iteritems())

        #noinspection PyUnusedLocal
        def match_one_set1_individual(idx, sorted_idx):
            global matching_ctx

            if not context_length(matching_ctx):
                raise StopIteration

            local_ctx = matching_ctx.copy()
            local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
#            pk = tuple(individual1[fname] for fname in pk_names)
#            optimized_expr = optimized_exprs.get(pk)
#            if optimized_expr is None:
#                for name in pk_names:
#                    fake_set1['__f_%s' % name].value = individual1[name]
#                optimized_expr = str(symbolic_expr.simplify())
#                optimized_exprs[pk] = optimized_expr
#            set2_scores = evaluate(optimized_expr, mm_dict, set2)
            set2_scores = expr_eval(score_expr, local_ctx)

            individual2_idx = np.argmax(set2_scores)

            id1 = local_ctx['id']
            id2 = matching_ctx['__other_id'][individual2_idx]
            matching_ctx = context_delete(matching_ctx, individual2_idx)

            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1

        loop_wh_progress(match_one_set1_individual, set1tomatch,
                         title="Matching...")
        return result
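
The inner loop in Example #8 is a greedy matching: each set 1 individual, taken in `orderby` order, picks the remaining set 2 candidate with the highest score, which is then removed. A stripped-down sketch of the same loop on plain arrays (the score function and data are made up):

    import numpy as np

    ids1 = np.array([1, 2, 3])
    age1 = np.array([30, 40, 50])
    ids2 = np.array([10, 11, 12])
    age2 = np.array([42, 29, 51])

    def score(a1, other_age):
        # pretend score expression: prefer the closest age
        return -np.abs(other_age - a1)

    matches = {}
    remaining_ids, remaining_age = ids2.copy(), age2.copy()
    for i in range(len(ids1)):
        if not len(remaining_ids):
            break
        best = score(age1[i], remaining_age).argmax()
        matches[ids1[i]] = remaining_ids[best]
        # remove the matched candidate, like context_delete does above
        remaining_ids = np.delete(remaining_ids, best)
        remaining_age = np.delete(remaining_age, best)

    print(matches)   # {1: 11, 2: 10, 3: 12}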
Example #9
    def evaluate(self, context):
        source_entity = context['__entity__']
        if self.entity_name is None:
            target_entity = source_entity
        else:
            target_entity = entity_registry[self.entity_name]

        if target_entity is source_entity:
            target_context = context
        else:
            target_context = \
                EntityContext(target_entity,
                              {'period': context['period'],
                               '__globals__': context['__globals__']})
        ctx_filter = context.get('__filter__')

        if self.filter is not None and ctx_filter is not None:
            filter_expr = ctx_filter & self.filter
        elif self.filter is not None:
            filter_expr = self.filter
        elif ctx_filter is not None:
            filter_expr = ctx_filter
        else:
            filter_expr = None

        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif self.number is not None:
            to_give_birth = None
            num_birth = self.number
        else:
            raise Exception('no filter nor number in "new"')

        array = target_entity.array

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth)
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context['period']

            used_variables = self._collect_kwargs_variables(context)
            if to_give_birth is None:
                child_context = new_context_like(context, length=num_birth)
            else:
                child_context = context_subset(context, to_give_birth,
                                               used_variables)
            for k, v in self.kwargs.iteritems():
                children[k] = expr_eval(v, child_context)

        add_individuals(target_context, children)

        # result is the ids of the new individuals corresponding to the source
        # entity
        if to_give_birth is not None:
            result = np.empty(context_length(context), dtype=int)
            result.fill(-1)
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a bit faster, but is currently buggy when
            # working with columns of structured arrays.
            # See http://projects.scipy.org/numpy/ticket/1869
            result[to_give_birth] = children['id']
            return result
        else:
            return None
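
The value returned by Example #9 maps each row of the source entity to the id of the child it created, or -1. A minimal sketch of that bookkeeping, assuming new ids simply continue after the existing ones:

    import numpy as np

    to_give_birth = np.array([False, True, False, True, True])
    num_birth = int(to_give_birth.sum())
    num_individuals = 100                  # ids 0..99 already exist

    children_ids = np.arange(num_individuals, num_individuals + num_birth)

    result = np.empty(len(to_give_birth), dtype=int)
    result.fill(-1)
    result[to_give_birth] = children_ids
    print(result)                          # [ -1 100  -1 101 102]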
Example #10
    def evaluate(self, context):
        expressions = self.expressions
        columns = [expr_eval(e, context) for e in expressions]
        if self.filter is not None:
            filter_value = expr_eval(self.filter, context)
            # TODO: make a function out of this, I think we have this pattern
            # in several places
            filtered_columns = [
                col[filter_value] if isinstance(col, np.ndarray) and col.shape else [col] for col in columns
            ]
        else:
            filtered_columns = columns

        possible_values = [np.unique(col) for col in filtered_columns]
        groups = partition_nd(filtered_columns, True, possible_values)

        # TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
        #        if self.filter is not None:
        #            filter_value = expr_eval(self.filter, context)
        #        else:
        #            filter_value = True
        #
        #        d = group_indices_nd(columns, filter_value)
        #        pvalues = sorted(d.keys())
        #        ndim = len(columns)
        #        possible_values = [[pv[i] for pv in pvalues]
        #                           for i in range(ndim)]
        #        groups = [d[k] for k in pvalues]

        # groups is a (flat) list of lists.
        # the first variable is the outer-most "loop",
        # the last one the inner-most.

        # add total for each row
        folded_exprs = len(expressions) - 1
        len_pvalues = [len(vals) for vals in possible_values]
        width = len_pvalues[-1]
        height = prod(len_pvalues[:-1])

        def xy_to_idx(x, y):
            # divide by the prod of possible values of expressions to its
            # right, mod by its own number of possible values
            offsets = [(y // prod(len_pvalues[v + 1 : folded_exprs])) % len_pvalues[v] for v in range(folded_exprs)]
            return sum(v * prod(len_pvalues[i + 1 :]) for i, v in enumerate(offsets)) + x

        groups_wh_totals = []
        for y in range(height):
            line_indices = []
            for x in range(width):
                member_indices = groups[xy_to_idx(x, y)]
                groups_wh_totals.append(member_indices)
                line_indices.extend(member_indices)
            groups_wh_totals.append(line_indices)

        # width just increased because of totals
        width += 1

        # add total for each column (including the "total per row" one)
        for x in range(width):
            column_indices = []
            for y in range(height):
                column_indices.extend(groups_wh_totals[y * width + x])
            groups_wh_totals.append(column_indices)

        # evaluate the expression on each group
        expr = self.expr
        used_variables = expr.collect_variables(context)
        used_variables.add("id")

        data = []
        for member_indices in groups_wh_totals:
            local_context = context_subset(context, member_indices, used_variables)
            data.append(expr_eval(expr, local_context))

        if self.percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behavior of raising an exception.
            # This can happen at least when using the default expr (grpcount())
            # and the filter yields empty groups
            total_value = np.float64(data[-1])
            data = [100.0 * value / total_value for value in data]

        #        if self.by or self.percent:
        #            if self.percent:
        #                total_value = data[-1]
        #                divisors = [total_value for _ in data]
        #            else:
        #                num_by = len(self.by)
        #                inc = prod(len_pvalues[-num_by:])
        #                num_groups = len(groups)
        #                num_categories = prod(len_pvalues[:-num_by])
        #
        #                categories_groups_idx = [range(cat_idx, num_groups, inc)
        #                                         for cat_idx in range(num_categories)]
        #
        #                divisors = ...
        #
        #            data = [100.0 * value / divisor
        #                    for value, divisor in izip(data, divisors)]

        # gender | False | True | total
        #        |    20 |   16 |    35

        # gender | False | True |
        #   dead |       |      | total
        #  False |    20 |   15 |    35
        #   True |     0 |    1 |     1
        #  total |    20 |   16 |    36

        #          |   dead | False | True |
        # agegroup | gender |       |      | total
        #        5 |  False |    20 |   15 |    xx
        #        5 |   True |     0 |    1 |    xx
        #       10 |  False |    25 |   10 |    xx
        #       10 |   True |     1 |    1 |    xx
        #          |  total |    xx |   xx |    xx

        # add headers
        labels = [str(e) for e in expressions]
        if folded_exprs:
            result = [
                [""] * (folded_exprs - 1) + [labels[-1]] + list(possible_values[-1]) + [""],
                # 2nd line
                labels[:-1] + [""] * len(possible_values[-1]) + ["total"],
            ]
            categ_values = list(product(*possible_values[:-1]))
            last_line = [""] * (folded_exprs - 1) + ["total"]
            categ_values.append(last_line)
            height += 1
        else:
            # if there is only one expression, the headers are different
            result = [[labels[-1]] + list(possible_values[-1]) + ["total"]]
            categ_values = [[""]]

        for y in range(height):
            result.append(list(categ_values[y]) + data[y * width : (y + 1) * width])

        return PrettyTable(result)
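
The `xy_to_idx` helper in Example #10 converts a (column, row) position of the output table into an index in the flat `groups` list using mixed-radix arithmetic over the folded expressions. A small standalone check with toy dimensions (`prod` is redefined here to keep the snippet self-contained):

    # 3 expressions with 2, 3 and 4 possible values; the last one provides the
    # columns, the first two are folded into the rows
    len_pvalues = [2, 3, 4]
    folded_exprs = 2
    width = len_pvalues[-1]

    def prod(values):
        result = 1
        for v in values:
            result *= v
        return result

    def xy_to_idx(x, y):
        offsets = [(y // prod(len_pvalues[v + 1:folded_exprs])) % len_pvalues[v]
                   for v in range(folded_exprs)]
        return sum(v * prod(len_pvalues[i + 1:]) for i, v in enumerate(offsets)) + x

    print(xy_to_idx(2, 4))   # row 4 is (1, 1), hence 1 * 12 + 1 * 4 + 2 = 18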