Beispiel #1
0
def build_erf_aggregates(variables = None, year = 2006, unit = 1e6):
    """
    Fetch the relevant aggregates from erf data.

    The requested columns of the "erf_menage" table are loaded for `year`,
    renamed with the mapping returned by get_erf2of(), converted to float
    where possible and multiplied row by row by the "wprm" column. Every
    column other than 'ident' and 'wprm' is then replaced by its total
    divided by `unit`.

    Parameters
    ----------
    variables : list of str or None
        Columns to fetch. "wprm" is added when it is missing. The caller's
        list is never modified. None is forwarded unchanged to
        get_of_values.
    year : int
        Year of the erf data collection to read.
    unit : float
        Divisor applied to every total (default 1e6).

    Returns
    -------
    The `df.ix[0:1]` slice of the aggregated frame; every aggregated column
    holds the same value on all rows.
    """

    erf = DataCollection(year=year)
    if variables is not None and "wprm" not in variables:
        # Build a new list so the caller's argument is left untouched.
        variables = list(variables) + ["wprm"]
    print('Fetching aggregates from erf %s data' % str(year))
    df = erf.get_of_values(variables=variables, table="erf_menage")

    df.rename(columns=get_erf2of(), inplace=True)
    wprm = df["wprm"]
    for col in df.columns:
        try:
            df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            # Best effort: a column that cannot be converted is kept as is.
            pass
    df = df.mul(wprm, axis=0)
    for col in set(df.columns) - set(['ident', 'wprm']):
        try:
            df[col] = df[col].sum() / unit
        except (TypeError, ValueError):
            # Best effort: a column that cannot be summed is kept as is.
            pass

    # Aggregate so we only need 1 row.
    # NOTE(review): on an integer index `.ix[0:1]` is label based and
    # inclusive, so it may return two rows -- confirm what callers expect.
    return df.ix[0:1]
Beispiel #2
0
def build_erf_aggregates(variables=None, year=2006, unit=1e6):
    """
    Fetch the relevant aggregates from erf data.

    The requested columns of the "erf_menage" table are loaded for `year`,
    renamed with the mapping returned by get_erf2of(), converted to float
    where possible and multiplied row by row by the "wprm" column. Every
    column other than 'ident' and 'wprm' is then replaced by its total
    divided by `unit`.

    Parameters
    ----------
    variables : list of str or None
        Columns to fetch. "wprm" is added when it is missing. The caller's
        list is never modified. None is forwarded unchanged to
        get_of_values.
    year : int
        Year of the erf data collection to read.
    unit : float
        Divisor applied to every total (default 1e6).

    Returns
    -------
    The `df.ix[0:1]` slice of the aggregated frame; every aggregated column
    holds the same value on all rows.
    """

    erf = DataCollection(year=year)
    if variables is not None and "wprm" not in variables:
        # Build a new list so the caller's argument is left untouched.
        variables = list(variables) + ["wprm"]
    print('Fetching aggregates from erf %s data' % str(year))
    df = erf.get_of_values(variables=variables, table="erf_menage")

    df.rename(columns=get_erf2of(), inplace=True)
    wprm = df["wprm"]
    for col in df.columns:
        try:
            df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            # Best effort: a column that cannot be converted is kept as is.
            pass
    df = df.mul(wprm, axis=0)
    for col in set(df.columns) - set(['ident', 'wprm']):
        try:
            df[col] = df[col].sum() / unit
        except (TypeError, ValueError):
            # Best effort: a column that cannot be summed is kept as is.
            pass

    # Aggregate so we only need 1 row.
    # NOTE(review): on an integer index `.ix[0:1]` is label based and
    # inclusive, so it may return two rows -- confirm what callers expect.
    return df.ix[0:1]
Beispiel #3
0
    def preproc(self):
        """
        Build the tables used to compare the simulation with the erf data.

        Starting from self.variable, the method collects the names of the
        column, of all its ancestors and of its direct children, together
        with the option names of the column and its ancestors. It then builds four tables
        and stores them on the instance:

        - self.simu_aggr_tables: simulation values aggregated on the "men"
          entity, one merge per variable on 'idmen';
        - self.erf_eec_indivi: inner merge on 'noindiv' of the "erf_indivi"
          and "eec_indivi" tables;
        - self.erf_menage: the "erf_menage" table with 'ident' renamed to
          'idmen';
        - self.simu_nonaggr_tables: the option columns taken from the
          simulation output table and, when some are missing there, from
          the input table.

        Both household tables are restricted to their common 'idmen' values
        and the number of mismatching rows per common variable is printed.

        Side effects: adds a 'noindiv' column to the simulation output
        table, clears the console with os.system('cls') and prints progress
        messages.
        """

        # NOTE(review): these four locals are all reassigned further down
        # before being read, so the initial reads look redundant -- confirm.
        erf_menage = self.erf_menage
        erf_eec_indivi = self.erf_eec_indivi
        simu_aggr_tables = self.simu_aggr_tables
        simu_nonaggr_tables = self.simu_nonaggr_tables

        def get_all_ancestors(varlist):
            # Recursively list every column of varlist, each one followed by
            # the columns reachable through its _parents (duplicates kept).
            if len(varlist) == 0:
                return []
            else:
                if varlist[0]._parents == set():
                    return ([varlist[0]] + get_all_ancestors(varlist[1:]))
                else:
                    return ([varlist[0]] +
                            get_all_ancestors(list(varlist[0]._parents)) +
                            get_all_ancestors(varlist[1:]))

        # We want to get all ancestors + children + the options that we're going to encounter
        parents = map(
            lambda x: self.simulation.output_table.column_by_name.get(x),
            [self.variable])
        parents = get_all_ancestors(parents)
        # Collect the option names of every ancestor, without duplicates.
        options = []
        for varcol in parents:
            options.extend(varcol._option.keys())
        options = list(set(options))
        #print options
        # From here on only the names of the ancestor columns are needed.
        parents = map(lambda x: x.name, parents)
        for var in [self.variable]:
            children = set()
            varcol = self.simulation.output_table.column_by_name.get(var)
            children = children.union(
                set(map(lambda x: x.name, varcol._children)))
        variables = list(set(parents + list(children)))
        #print variables
        del parents, children
        gc.collect()

        def get_var(variable):
            # First element of the result of aggregating one variable on
            # the "men" entity.
            variables = [variable]
            return self.simulation.aggregated_by_entity(entity="men",
                                                        variables=variables,
                                                        all_output_vars=False,
                                                        force_sum=True)[0]

        # One aggregated table per variable, merged together on 'idmen'.
        simu_aggr_tables = get_var(variables[0])
        for var in variables[1:]:
            simu_aggr_tables = simu_aggr_tables.merge(
                get_var(var)[['idmen', var]], on='idmen', how='outer')
        # We load the data from erf table in case we have to pick data there
        erf_data = DataCollection(year=self.simulation.datesim.year)
        # Clears the console ('cls' is the Windows command).
        os.system('cls')
        todo = set(variables + ["ident", "wprm"]).union(set(options))
        print 'Variables or equivalents to fetch :'
        print todo
        # English translation of the note below:
        # General method for fetching the erf/eec variables (which do not
        # necessarily have the same name and are sometimes the variables
        # used to build the of ones):
        # 1 - try get_of2erf, which should work for the main variables (at
        #     least the aggregates being compared).
        # If the variables are not directly in the table, they were computed
        # from other erf/eec data variables, so look in:
        # 2 - build_survey
        # 3 - model/model.py, which may tell in which module of model/ to look.
        # The 'print todo' output tells you which variables to look for
        # (be careful not to include the direct children).
        # Using Ctrl-H (search and replace) is helpful!
        '''
        Méthode générale pour aller chercher les variables de l'erf/eec
        ( qui n'ont pas forcément le même nom
        et parfois sont les variables utilisées pour créér l'of ):
        1 - essayer le get_of2erf, ça doit marcher pour les variables principales ( au moins les aggrégats
        que l'on compare )
        Si les variables ne sont pas directement dans la table,
        elles ont été calculées à partir d'autres variables de données erf/eec
        donc chercher dans :
        2 - build_survey
        3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
        Le 'print todo' vous indique quelles variables chercher
        ( attention à ne pas inclure les enfants directs )
        L'utilisation du Ctrl-H est profitable !
        '''

        # Individual-level columns, hard-coded, read from the eec and erf
        # tables and joined on 'noindiv'.
        fetch_eec = [
            'statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim',
            'naia', 'noindiv'
        ]
        fetch_erf = [
            'zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic'
        ]
        erf_df = erf_data.get_of_values(variables=fetch_erf,
                                        table="erf_indivi")
        eec_df = erf_data.get_of_values(variables=fetch_eec,
                                        table="eec_indivi")
        erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
        assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
        del eec_df, erf_df

        # We then get the aggregate variables for the menage ( mainly to compare with of )
        print 'Loading data from erf_menage table'
        erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                            table="erf_menage")

        del todo
        gc.collect()
        assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

        # Rename the erf_menage columns with the mapping returned by
        # get_erf2of().
        from openfisca_france.data.erf import get_erf2of
        erf2of = get_erf2of()
        erf_menage.rename(columns=erf2of, inplace=True)

        # We get the options from the simulation non aggregated tables:

        # First from the output_table
        # We recreate the noindiv in output_table
        # (100 * idmen_ind + noi_ind, then cast to int64)
        self.simulation.output_table.table[
            'noindiv'] = 100 * self.simulation.output_table.table.idmen_ind + self.simulation.output_table.table.noi_ind
        self.simulation.output_table.table[
            'noindiv'] = self.simulation.output_table.table['noindiv'].astype(
                np.int64)
        # s1: the options available in the output table plus the id columns.
        s1 = [
            var for var in set(options).intersection(
                set(self.simulation.output_table.table.columns))
        ] + ['idmen_ind', 'quimen_ind', 'noindiv']
        simu_nonaggr_tables = (self.simulation.output_table.table)[s1]
        simu_nonaggr_tables.rename(columns={
            'idmen_ind': 'idmen',
            'quimen_ind': 'quimen'
        },
                                   inplace=True)
        assert 'noindiv' in simu_nonaggr_tables.columns

        # If some options were not found, we delve into the input_table
        # (the test is a strict-subset comparison against all the options).
        if (set(s1) -
                set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
            assert 'noindiv' in self.simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
            # s2: the options found in the input table but not already in s1.
            s2 = [
                var for var in (set(options).intersection(
                    set(self.simulation.input_table.table.columns)) - set(s1))
            ] + ['noindiv']
            #print s2
            temp = self.simulation.input_table.table[s2]
            simu_nonaggr_tables = simu_nonaggr_tables.merge(temp,
                                                            on='noindiv',
                                                            how='inner',
                                                            sort=False)

            del s2, temp
        del s1
        gc.collect()

        # Keep only the option columns and the three id columns.
        # NOTE(review): an option present in neither table would presumably
        # make this selection fail -- confirm.
        simu_nonaggr_tables = simu_nonaggr_tables[
            list(set(options)) + ['idmen', 'quimen', 'noindiv']]
        #print options, variables
        assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

        # Check the idmens that are not common
        erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)

        print "\n"
        print 'Checking if idmen is here...'
        print '\n ERF : '
        print 'idmen' in erf_menage.columns
        print "\n Simulation output"
        print 'idmen' in simu_aggr_tables.columns
        print "\n"

        # erf_menage must already be free of duplicated idmen; duplicates
        # are dropped from the simulation table before the same check.
        #print 'Dropping duplicates of idmen for both tables...'
        assert not erf_menage["idmen"].duplicated().any(
        ), "Duplicated idmen in erf_menage"
        #erf_menage.drop_duplicates('idmen', inplace = True)
        simu_aggr_tables.drop_duplicates('idmen', inplace=True)
        assert not simu_aggr_tables["idmen"].duplicated().any(
        ), "Duplicated idmen in of"

        # Report how many idmen exist on one side only.
        print 'Checking mismatching idmen... '
        s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
        if s1:
            print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
            pass
        s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
        if s2:
            print "idmen that aren't in erf_menage : %s" % str(len(s2))
            pass
        del s1, s2

        # Restrict both tables to the common idmens
        s3 = set(erf_menage['idmen']).intersection(
            set(simu_aggr_tables['idmen']))
        print "Restricting to %s common idmen... \n" % str(len(s3))
        erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
        simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
        del s3
        gc.collect()

        #print erf_menage.columns
        #print simu_aggr_tables.columns

        # Compare differences across of and erf dataframes
        print "Comparing differences between dataframes... \n"
        colcom = (set(erf_menage.columns).intersection(
            set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
        print 'Common variables: '
        print colcom
        erf_menage.reset_index(inplace=True)
        simu_aggr_tables.reset_index(inplace=True)
        # NOTE(review): after reset_index the two tables are compared row by
        # row; nothing here sorts them by idmen first, so this presumably
        # relies on both tables listing households in the same order -- confirm.
        for col in colcom:
            temp = set(
                erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
            print "Numbers of idmen that aren't equal on variable %s : %s \n" % (
                col, str(len(temp)))
            del temp

        # Publish the four tables on the instance.
        self.erf_menage = erf_menage
        self.erf_eec_indivi = erf_eec_indivi
        self.simu_aggr_tables = simu_aggr_tables
        self.simu_nonaggr_tables = simu_nonaggr_tables
Beispiel #4
0
def test(year=2006, variables=['af']):
    """
    Debugging scenario comparing simulation results with the erf data.

    Runs a SurveySimulation on the 'test.h5' survey file and shows the
    Debugger aggregates for "af". It then builds household-level tables
    from the simulation and from the erf/eec data, restricts both to their
    common 'idmen' values and, for every variable present on both sides,
    prints statistics on the relative difference, the ten most deviant
    households and, for Prestation columns, the variables and options they
    depend on.

    Parameters
    ----------
    year : int
        Year given to the simulation configuration and to DataCollection.
    variables : list of str
        Names of the output columns to investigate. The list is only read,
        never modified in place.

    Returns
    -------
    None; every result is reported with print.

    Side effects: adds a 'noindiv' column to the simulation output table
    and clears the console with os.system('cls').
    """
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

    variable = "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()

    def get_all_ancestors(varlist):
        # Each column of varlist followed by the columns reachable through
        # its _parents, depth first (duplicates are kept).
        if not varlist:
            return []
        head = varlist[0]
        return ([head] +
                get_all_ancestors(list(head._parents)) +
                get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    column_by_name = simulation.output_table.column_by_name
    parents = get_all_ancestors([column_by_name.get(name) for name in variables])
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    parent_names = [varcol.name for varcol in parents]
    # The accumulator is created once, so the children of every requested
    # variable are kept (it used to be reset on each iteration).
    children = set()
    for var in variables:
        varcol = column_by_name.get(var)
        children.update(child.name for child in varcol._children)
    variables = list(set(parent_names) | children)
    del parents, parent_names, children
    gc.collect()

    def get_var(variable):
        # First element of the result of aggregating one variable on the
        # "men" entity.
        return simulation.aggregated_by_entity(entity="men",
                                               variables=[variable],
                                               all_output_vars=False,
                                               force_sum=True)[0]

    # One aggregated table per variable, merged together on 'idmen'.
    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]],
                                                  on='idmen',
                                                  how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    # Clears the console ('cls' is the Windows command).
    os.system('cls')
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print('Variables or equivalents to fetch :')
    print(todo)
    # General method for fetching the erf/eec variables (which do not
    # necessarily have the same name and are sometimes the variables used to
    # build the of ones):
    # 1 - try get_of2erf, which should work for the main variables (at least
    #     the aggregates being compared).
    # If the variables are not directly in the table, they were computed from
    # other erf/eec data variables, so look in:
    # 2 - build_survey
    # 3 - model/model.py, which may tell in which module of model/ to look.
    # The printed `todo` set tells you which variables to look for (be
    # careful not to include the direct children).
    # Using Ctrl-H (search and replace) is helpful!

    fetch_eec = [
        'statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia',
        'noindiv'
    ]
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print('Loading data from erf_menage table')
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                        table="erf_menage")

    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:

    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table[
        'noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table[
        'noindiv'].astype(np.int64)
    s1 = [
        var for var in set(options).intersection(
            set(simulation.output_table.table.columns))
    ] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={
        'idmen_ind': 'idmen',
        'quimen_ind': 'quimen'
    },
                               inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If some options were not found, we delve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [
            var for var in (set(options).intersection(
                set(simulation.input_table.table.columns)) - set(s1))
        ] + ['noindiv']
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp,
                                                        on='noindiv',
                                                        how='inner',
                                                        sort=False)

        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) +
                                              ['idmen', 'quimen', 'noindiv']]
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)

    print("\n")
    print('Checking if idmen is here...')
    print('\n ERF : ')
    print('idmen' in erf_menage.columns)
    print("\n Simulation output")
    print('idmen' in simu_aggr_tables.columns)
    print("\n")

    assert not erf_menage["idmen"].duplicated().any(
    ), "Duplicated idmen in erf_menage"
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(
    ), "Duplicated idmen in of"

    print('Checking mismatching idmen... ')
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print("idmen that aren't in simu_aggr_tables : %s" % str(len(s1)))
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print("idmen that aren't in erf_menage : %s" % str(len(s2)))
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print("Restricting to %s common idmen... \n" % str(len(s3)))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()

    # Compare differences across of and erf dataframes
    print("Comparing differences between dataframes... \n")
    colcom = (set(erf_menage.columns).intersection(
        set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print('Common variables: ')
    print(colcom)
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    # NOTE(review): the tables are compared row by row after reset_index;
    # nothing sorts them by idmen first, so this presumably relies on both
    # listing households in the same order -- confirm.
    for col in colcom:
        temp = set(
            erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print("Numbers of idmen that aren't equal on variable %s : %s \n" % (
            col, str(len(temp))))
        del temp

    # Detect the biggest differences
    bigtable = merge(erf_menage,
                     simu_aggr_tables,
                     on='idmen',
                     how='inner',
                     suffixes=('_erf', '_of'))
    print('Length of new dataframe is %s' % str(len(bigtable)))
    bigtable.set_index('idmen', drop=False, inplace=True)

    already_met = []
    options_met = []

    def resolve_columns(names, table):
        # Map variable names to the matching column names of `table`.
        # Returns (case, columns) where case is 'of' when every name is a
        # simulation column, 'erf' when the remaining ones exist in
        # erf_menage, and 'missing' otherwise (unresolved names are then
        # skipped).
        of_columns = set(simu_aggr_tables.columns)
        columns = []
        if set(names) <= of_columns:
            for var in names:
                if var + '_of' in table.columns:
                    columns.append(var + '_of')
                else:
                    columns.append(var)
            return 'of', columns
        if set(names) <= of_columns.union(erf_menage.columns):
            for var in names:
                if var in simu_aggr_tables.columns:
                    if var + '_of' in table.columns:
                        columns.append(var + '_of')
                elif var + '_erf' in table.columns:
                    columns.append(var + '_erf')
                else:
                    columns.append(var)
            return 'erf', columns
        for var in names:
            if var in simu_aggr_tables.columns:
                if var + '_of' in table.columns:
                    columns.append(var + '_of')
            elif var in erf_menage.columns:
                if var + '_erf' in table.columns:
                    columns.append(var + '_erf')
        return 'missing', columns

    children_headers = {
        'of': "Variables which need %s to be computed :\n %s \n",
        'erf': "Variables which need %s to be computed (some missing picked in erf):\n %s \n",
        'missing': "Variables which need %s to be computed (some missing):\n %s \n",
    }

    for col in colcom:
        # Keep the rows where both sources are non-zero and compute the
        # absolute relative difference with respect to the erf value.
        table = bigtable[and_(bigtable[col + '_erf'] != 0,
                              bigtable[col + '_of'] != 0)]
        table[col] = (table[col + '_erf'] - table[col + '_of']
                      ) / table[col + '_erf']
        table[col] = table[col].apply(lambda x: abs(x))
        print('Minimum difference between the two tables for %s is %s' % (
            col, str(table[col].min())))
        print('Maximum difference between the two tables for %s is %s' % (
            col, str(table[col].max())))
        print(table[col].describe())
        try:
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col],
                              np.arange(1, 11),
                              table['wprm_of'],
                              2,
                              return_quantiles=True)
            dec, values = mwp(table[col],
                              np.arange(1, 101),
                              table['wprm_erf'],
                              2,
                              return_quantiles=True)
            del dec, values
            gc.collect()
        except Exception:
            # Best effort: the weighted percentiles are informative only.
            pass
        print("\n")

        # Show the relevant information for the most deviant households
        table.sort(columns=col, ascending=False, inplace=True)
        # bigtemp is filled little by little with the relevant variables:
        # 'table' holds aggregated values, 'options' individual variables.
        # A dictionary is used so that the nested recursive function
        # iter_on_parents can update its entries.
        bigtemp = {
            'table': table[[col, col + '_of', col + '_erf', 'idmen']][0:10],
            'options': None
        }
        bigtemp['table'][col + 'div'] = bigtemp['table'][
            col + '_of'] / bigtemp['table'][col + '_erf']
        print(bigtemp['table'].to_string())

        # If variable is a Prestation, we show the dependencies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):
            # For the direct children
            if varcol._children is not None:
                ch_to_fetch = [child.name for child in varcol._children]
                case, ch_fetched = resolve_columns(ch_to_fetch, table)
                print(children_headers[case] % (col, str(ch_to_fetch)))
                print(table[[col] + ch_fetched][0:10])
                print("\n")
                del ch_to_fetch, ch_fetched

            # For the parents
            def iter_on_parents(varcol):
                if (varcol._parents == set() and varcol._option
                        == {}) or varcol.name in already_met:
                    return
                # Resolve the parents' names. The lookup used to iterate
                # over the still empty result list, so no parent column was
                # ever fetched.
                par_to_fetch = [parent.name for parent in varcol._parents]
                _, par_fetched = resolve_columns(par_to_fetch, table)

                if len(par_fetched) > 0:
                    temp = table[[col, 'idmen'] + par_fetched][0:10]
                    bigtemp['table'] = pd.merge(temp,
                                                bigtemp['table'],
                                                how='inner')
                if varcol._option != {} and not set(
                        varcol._option.keys()) < set(options_met):
                    vars_to_fetch = list(
                        set(varcol._option.keys()) - set(options_met))
                    # idmen of the (at most ten) most deviant households, in
                    # order; bounded by the table length so short tables do
                    # not raise IndexError.
                    top_idmen = list(table['idmen'].iloc[0:10])
                    temp = simu_nonaggr_tables[
                        ['idmen', 'quimen', 'noindiv'] +
                        vars_to_fetch][simu_nonaggr_tables['idmen'].isin(
                            table['idmen'][0:10])]

                    # Reorder the individual rows household by household.
                    temp_sorted = temp[temp['idmen'].isin(top_idmen[:1])]
                    for idmen in top_idmen[1:]:
                        temp_sorted = temp_sorted.append(
                            temp[temp['idmen'] == idmen])
                    if bigtemp['options'] is None:
                        bigtemp['options'] = temp_sorted
                        bigtemp['options'] = bigtemp['options'].merge(
                            erf_eec_indivi, on='noindiv', how='outer')
                    else:
                        bigtemp['options'] = bigtemp['options'].merge(
                            temp_sorted,
                            on=['noindiv', 'idmen', 'quimen'],
                            how='outer')
                    del temp, temp_sorted
                    gc.collect()

                already_met.append(varcol.name)
                options_met.extend(varcol._option.keys())
                for var in varcol._parents:
                    iter_on_parents(var)

            iter_on_parents(varcol)
            # Nothing to merge or regroup when no option table was collected
            # (merging with None would fail).
            if bigtemp['options'] is not None:
                # We merge the aggregate table with the option table ( for each individual in entity )
                bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                          how='left',
                                                          on='idmen',
                                                          suffixes=('(agg)',
                                                                    '(ind)'))

                # Reshaping the table to group by descending error on col, common entities
                # NOTE(review): the sort key is the literal 'af' rather than
                # `col`, and 'af' may have been suffixed by the merge above
                # -- confirm the intended column.
                bigtemp['table'].sort(columns=['af', 'quimen'],
                                      ascending=[False, True],
                                      inplace=True)
                bigtemp['table'] = bigtemp['table'].groupby(['idmen', 'quimen'],
                                                            sort=False).sum()
            print("Table of values for %s dependencies : \n" % col)
            print(bigtemp['table'].to_string())
            del bigtemp['table'], bigtemp['options']
            gc.collect()
Beispiel #5
0
    def preproc(self):
        """
        Prepare the erf and simulation tables that are compared afterwards.

        Starting from ``self.variable`` the method gathers its ancestors,
        its direct children and every option met on the way, aggregates the
        simulated values with ``entity="men"``, loads the erf/eec survey data,
        keeps only the ``idmen`` present on both sides and prints, for each
        common column, how many ``idmen`` hold different values.

        Results are stored on ``self.erf_menage``, ``self.erf_eec_indivi``,
        ``self.simu_aggr_tables`` and ``self.simu_nonaggr_tables``.
        """

        # These attributes are read first; each local is rebound further down.
        erf_menage = self.erf_menage
        erf_eec_indivi = self.erf_eec_indivi
        simu_aggr_tables = self.simu_aggr_tables
        simu_nonaggr_tables = self.simu_nonaggr_tables

        def get_all_ancestors(varlist):
            # Pre-order walk: each column is followed by the ancestors of its
            # own parents before the next column of varlist is visited.
            collected = []
            pending = list(reversed(varlist))
            while pending:
                column = pending.pop()
                collected.append(column)
                pending.extend(reversed(list(column._parents)))
            return collected

        # We want to get all ancestors + children + the options that we're going to encounter
        column_by_name = self.simulation.output_table.column_by_name
        parents = get_all_ancestors([column_by_name.get(self.variable)])
        options = []
        for ancestor in parents:
            options.extend(ancestor._option.keys())
        options = list(set(options))
        parents = [ancestor.name for ancestor in parents]

        target_column = column_by_name.get(self.variable)
        children = set().union(set([child.name for child in target_column._children]))
        variables = list(set(parents + list(children)))
        del parents, children
        gc.collect()

        def get_var(variable):
            # Aggregated frame for a single variable.
            aggregated = self.simulation.aggregated_by_entity(
                entity="men",
                variables=[variable],
                all_output_vars=False,
                force_sum=True)
            return aggregated[0]

        simu_aggr_tables = get_var(variables[0])
        for name in variables[1:]:
            simu_aggr_tables = simu_aggr_tables.merge(
                get_var(name)[['idmen', name]], on='idmen', how='outer')

        # We load the data from erf table in case we have to pick data there
        erf_data = DataCollection(year=self.simulation.datesim.year)
        os.system('cls')
        todo = set(variables + ["ident", "wprm"]).union(set(options))
        print('Variables or equivalents to fetch :')
        print(todo)

        # General method for fetching the erf/eec variables (they do not
        # necessarily carry the same name, and are sometimes the variables
        # used to build the "of" ones):
        # 1 - try get_of2erf, it should work for the main variables (at least
        #     the aggregates being compared).
        # If the variables are not directly in the table, they were computed
        # from other erf/eec data variables, so look in:
        # 2 - build_survey
        # 3 - model/model.py, which may tell which module of model/ to look in.
        # The 'print todo' shows which variables to look for (take care not
        # to include the direct children).
        # Using Ctrl-H is helpful!

        fetch_eec = ['statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia', 'noindiv']
        fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
        erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
        eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
        erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
        assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
        del eec_df, erf_df

        # We then get the aggregate variables for the menage ( mainly to compare with of )
        print('Loading data from erf_menage table')
        erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'], table="erf_menage")

        del todo
        gc.collect()
        assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

        from openfisca_france.data.erf import get_erf2of
        erf_menage.rename(columns=get_erf2of(), inplace=True)

        # We get the options from the simulation non aggregated tables:

        # First from the output_table
        # We recreate the noindiv in output_table
        out_frame = self.simulation.output_table.table
        out_frame['noindiv'] = 100 * out_frame.idmen_ind + out_frame.noi_ind
        out_frame['noindiv'] = out_frame['noindiv'].astype(np.int64)
        id_columns = ['idmen_ind', 'quimen_ind', 'noindiv']
        from_output = list(set(options).intersection(set(out_frame.columns))) + id_columns
        simu_nonaggr_tables = out_frame[from_output]
        simu_nonaggr_tables.rename(columns={'idmen_ind': 'idmen', 'quimen_ind': 'quimen'}, inplace=True)
        assert 'noindiv' in simu_nonaggr_tables.columns

        # If not found, we delve into the input_table
        if (set(from_output) - set(id_columns)) < set(options):
            in_frame = self.simulation.input_table.table
            assert 'noindiv' in in_frame.columns, "'noindiv' not in simulation.input_table.table.columns"
            from_input = list(
                set(options).intersection(set(in_frame.columns)) - set(from_output)) + ['noindiv']
            extra = in_frame[from_input]
            simu_nonaggr_tables = simu_nonaggr_tables.merge(extra, on='noindiv', how='inner', sort=False)

            del from_input, extra
        del from_output
        gc.collect()

        simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen', 'noindiv']]
        assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

        # Check the idmens that are not common
        erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)

        print("\n")
        print('Checking if idmen is here...')
        print('\n ERF : ')
        print('idmen' in erf_menage.columns)
        print("\n Simulation output")
        print('idmen' in simu_aggr_tables.columns)
        print("\n")

        # Only the simulation side is de-duplicated; erf_menage must already be unique.
        assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
        simu_aggr_tables.drop_duplicates('idmen', inplace=True)
        assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

        print('Checking mismatching idmen... ')
        only_in_erf = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
        if only_in_erf:
            print("idmen that aren't in simu_aggr_tables : %s" % str(len(only_in_erf)))
        only_in_simu = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
        if only_in_simu:
            print("idmen that aren't in erf_menage : %s" % str(len(only_in_simu)))
        del only_in_erf, only_in_simu

        # Restrict to common idmens and merge
        shared_ids = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
        print("Restricting to %s common idmen... \n" % str(len(shared_ids)))
        erf_menage = erf_menage[erf_menage['idmen'].isin(shared_ids)]
        simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(shared_ids)]
        del shared_ids
        gc.collect()

        # Compare differences across of and erf dataframes
        print("Comparing differences between dataframes... \n")
        colcom = (set(erf_menage.columns).intersection(set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
        print('Common variables: ')
        print(colcom)
        erf_menage.reset_index(inplace=True)
        simu_aggr_tables.reset_index(inplace=True)
        for col in colcom:
            differs = erf_menage[col] != simu_aggr_tables[col]
            unequal_ids = set(erf_menage['idmen'][differs])
            print("Numbers of idmen that aren't equal on variable %s : %s \n" % (col, str(len(unequal_ids))))
            del differs, unequal_ids

        self.erf_menage = erf_menage
        self.erf_eec_indivi = erf_eec_indivi
        self.simu_aggr_tables = simu_aggr_tables
        self.simu_nonaggr_tables = simu_nonaggr_tables
Beispiel #6
0
def test(year=2006, variables=['af']):
    """
    Run a survey simulation and compare its aggregates with the erf data.

    The simulation is computed from ``test.h5`` and a ``Debugger`` prints its
    aggregates.  Then, for every variable shared by the erf_menage table and
    the aggregated simulation output, the relative difference is printed
    along with the dependencies (children, parents and options) of the
    entries that differ the most.

    :param year: year handed to the simulation config and to DataCollection.
    :param variables: names of the simulation output columns to inspect.
        The default list is never mutated; the name is only rebound.
    """
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

    variable = "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()

    def get_all_ancestors(varlist):
        # Pre-order walk: each column is followed by the ancestors of its own
        # parents before the next column of varlist is visited.
        collected = []
        pending = list(reversed(varlist))
        while pending:
            column = pending.pop()
            collected.append(column)
            pending.extend(reversed(list(column._parents)))
        return collected

    # We want to get all ancestors + children + the options that we're going to encounter
    column_by_name = simulation.output_table.column_by_name
    parents = get_all_ancestors([column_by_name.get(name) for name in variables])
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    parents = [varcol.name for varcol in parents]

    # The accumulator lives outside the loop so the children of every
    # requested variable are kept, not only those of the last one.
    children = set()
    for var in variables:
        varcol = column_by_name.get(var)
        children = children.union(set([child.name for child in varcol._children]))
    variables = list(set(parents + list(children)))
    del parents, children
    gc.collect()

    def get_var(variable):
        # Aggregated frame for a single variable.
        return simulation.aggregated_by_entity(entity="men", variables=[variable],
                                               all_output_vars=False, force_sum=True)[0]

    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]], on='idmen', how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print('Variables or equivalents to fetch :')
    print(todo)

    # General method for fetching the erf/eec variables (they do not
    # necessarily carry the same name, and are sometimes the variables used
    # to build the "of" ones):
    # 1 - try get_of2erf, it should work for the main variables (at least the
    #     aggregates being compared).
    # If the variables are not directly in the table, they were computed from
    # other erf/eec data variables, so look in:
    # 2 - build_survey
    # 3 - model/model.py, which may tell which module of model/ to look in.
    # The 'print todo' shows which variables to look for (take care not to
    # include the direct children).
    # Using Ctrl-H is helpful!

    fetch_eec = ['statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia', 'noindiv']
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print('Loading data from erf_menage table')
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'], table="erf_menage")

    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:

    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table['noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table['noindiv'].astype(np.int64)
    s1 = list(set(options).intersection(set(simulation.output_table.table.columns))) + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={'idmen_ind': 'idmen', 'quimen_ind': 'quimen'}, inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we delve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = list(set(options).intersection(set(simulation.input_table.table.columns)) - set(s1)) + ['noindiv']
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on='noindiv', how='inner', sort=False)

        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen', 'noindiv']]
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)

    print("\n")
    print('Checking if idmen is here...')
    print('\n ERF : ')
    print('idmen' in erf_menage.columns)
    print("\n Simulation output")
    print('idmen' in simu_aggr_tables.columns)
    print("\n")

    # Only the simulation side is de-duplicated; erf_menage must already be unique.
    assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

    print('Checking mismatching idmen... ')
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print("idmen that aren't in simu_aggr_tables : %s" % str(len(s1)))
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print("idmen that aren't in erf_menage : %s" % str(len(s2)))
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print("Restricting to %s common idmen... \n" % str(len(s3)))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()

    # Compare differences across of and erf dataframes
    print("Comparing differences between dataframes... \n")
    colcom = (set(erf_menage.columns).intersection(set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print('Common variables: ')
    print(colcom)
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    for col in colcom:
        temp = set(erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print("Numbers of idmen that aren't equal on variable %s : %s \n" % (col, str(len(temp))))
        del temp

    # Detect the biggest differences
    bigtable = merge(erf_menage, simu_aggr_tables, on='idmen', how='inner', suffixes=('_erf', '_of'))
    print('Length of new dataframe is %s' % str(len(bigtable)))
    bigtable.set_index('idmen', drop=False, inplace=True)

    # Shared across all columns of colcom: names of the columns and options
    # already handled by iter_on_parents.
    already_met = []
    options_met = []

    for col in colcom:
        table = bigtable[and_(bigtable[col + '_erf'] != 0, bigtable[col + '_of'] != 0)]
        table[col] = (table[col + '_erf'] - table[col + '_of']) / table[col + '_erf']  # Relative difference
        table[col] = table[col].apply(lambda x: abs(x))
        print('Minimum difference between the two tables for %s is %s' % (col, str(table[col].min())))
        print('Maximum difference between the two tables for %s is %s' % (col, str(table[col].max())))
        print(table[col].describe())
        try:
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col], np.arange(1, 11), table['wprm_of'], 2, return_quantiles=True)
            dec, values = mwp(table[col], np.arange(1, 101), table['wprm_erf'], 2, return_quantiles=True)
            del dec, values
            gc.collect()
        except Exception:
            # Best effort only: the weighted percentiles are not used below.
            pass
        print("\n")

        # Show the relevant information for the most deviant households
        table.sort(columns=col, ascending=False, inplace=True)

        # bigtemp is filled little by little with the relevant variables.
        # 'table' holds aggregated values while 'options' holds individual
        # variables.  A dictionary is used because iter_on_parents rebinds
        # its entries from inside the nested function.
        bigtemp = {'table': table[[col, col + '_of', col + '_erf', 'idmen']][0:10],
                   'options': None}
        bigtemp['table'][col + 'div'] = bigtemp['table'][col + '_of'] / bigtemp['table'][col + '_erf']
        print(bigtemp['table'].to_string())

        # If variable is a Prestation, we show the dependencies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):

            # For the direct children
            if varcol._children is not None:
                ch_to_fetch = [child.name for child in varcol._children]
                ch_fetched = []

                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print("Variables which need %s to be computed :\n %s \n" % (col, str(ch_to_fetch)))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                    print("Variables which need %s to be computed (some missing picked in erf):\n %s \n" % (col, str(ch_to_fetch)))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                            ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print("Variables which need %s to be computed (some missing):\n %s \n" % (col, str(ch_to_fetch)))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')

                print(table[[col] + ch_fetched][0:10])
                print("\n")
                del ch_to_fetch, ch_fetched

            # For the parents
            def iter_on_parents(varcol):
                if (varcol._parents == set() and varcol._option == {}) or varcol.name in already_met:
                    return

                par_to_fetch = [parent.name for parent in varcol._parents]
                par_fetched = []

                # The parent names to resolve are in par_to_fetch; par_fetched
                # receives the matching column names of `table`.
                if set(par_to_fetch) <= set(simu_aggr_tables.columns):
                    for var in par_to_fetch:
                        if var + '_of' in table.columns:
                            par_fetched.append(var + '_of')
                        else:
                            par_fetched.append(var)
                elif set(par_to_fetch) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                    for var in par_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                            par_fetched.append(var + '_erf')
                        else:
                            par_fetched.append(var)
                else:
                    for var in par_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                par_fetched.append(var + '_erf')

                if len(par_fetched) > 0:
                    temp = table[[col, 'idmen'] + par_fetched][0:10]
                    # `merge` is the same function already used to build bigtable.
                    bigtemp['table'] = merge(temp, bigtemp['table'], how='inner')

                if varcol._option != {} and not set(varcol._option.keys()) < set(options_met):
                    vars_to_fetch = list(set(varcol._option.keys()) - set(options_met))
                    # idmen of the (at most ten) rows with the strongest
                    # difference; a slice copes with shorter tables.
                    top_ids = list(table['idmen'].iloc[0:10])
                    temp = simu_nonaggr_tables[['idmen', 'quimen', 'noindiv']
                                               + vars_to_fetch][simu_nonaggr_tables['idmen'].isin(top_ids)]

                    # Stack the individual rows in the same order as top_ids.
                    temp_sorted = None
                    for one_id in top_ids:
                        rows = temp[temp['idmen'] == one_id]
                        if temp_sorted is None:
                            temp_sorted = rows
                        else:
                            temp_sorted = temp_sorted.append(rows)

                    if temp_sorted is not None:
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(erf_eec_indivi, on='noindiv', how='outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(temp_sorted, on=['noindiv', 'idmen', 'quimen'], how='outer')
                    del temp, temp_sorted
                    gc.collect()

                already_met.append(varcol.name)
                options_met.extend(varcol._option.keys())
                for var in varcol._parents:
                    iter_on_parents(var)

            iter_on_parents(varcol)
            # We merge the aggregate table with the option table ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                      how='left',
                                                      on='idmen',
                                                      suffixes=('(agg)', '(ind)'))

            # Reshaping the table to group by descending error on col, common entities
            # NOTE(review): the sort key 'af' is hard-coded rather than `col` - confirm.
            bigtemp['table'].sort(columns=['af', 'quimen'], ascending=[False, True], inplace=True)
            bigtemp['table'] = bigtemp['table'].groupby(['idmen', 'quimen'], sort=False).sum()
            print("Table of values for %s dependencies : \n" % col)
            print(bigtemp['table'].to_string())
            del bigtemp['table'], bigtemp['options']
            gc.collect()
Beispiel #7
0
def check_converted(show_columns=False):
    """
    Print diagnostics about the converted 2006 survey.

    Loads ``survey_2006`` from ``survey_test`` and the four entity tables
    from ``survey3_test``, prints the erf/eec rows of one household, the
    number of distinct ``idfoy`` and the rows attached to a fixed list of
    ``idfoy``.

    :param show_columns: when true, also print the sorted column names of
        the fam/foy/men/ind tables and of the input table.  The default
        stops before that listing, as the function always did.
    """
    # Retrieving the input and output files for analysis.  Each store is
    # closed as soon as its frames are in memory, even if a key is missing.
    store = HDFStore(survey_test)
    try:
        input_df = store['survey_2006']
    finally:
        store.close()

    output = HDFStore(survey3_test)
    try:
        df_fam = output['survey_2006/fam']
        df_foy = output['survey_2006/foy']
        df_men = output['survey_2006/men']
        df_ind = output['survey_2006/ind']
    finally:
        output.close()

    year = 2006
    erf = DataCollection(year=year)
    df = erf.get_of_values(table="erf_indivi")
    df2 = erf.get_of_values(table="eec_indivi")
    print('\n')
    print(df.loc[df.ident == 6030189, :].to_string())
    print(df2.loc[df2.ident == 6030189, :].to_string())

    print(len(np.unique(input_df['idfoy'].values)))
    print(len(np.unique(input_df.loc[input_df['quifoy'] == 0, 'idfoy'].values)))

    liste = [601228002, 602671302, 602016402, 603069602, 601365902, 602679402, 602680905, 603074902, 600848302,
             602684902, 601508802, 601427302, 601774602, 600466102, 603448202, 603091202, 602437502, 603224003,
             603093102, 601261802, 601000002, 601789602, 601660602, 600350102, 601927802, 601797902, 601667902,
             601537502, 600227602, 602854502, 602071902, 600144702, 602205702, 600769302, 601096602, 602609202,
             601301302, 602220302, 602486102, 601376802, 601570902, 600654802, 601443202, 603412402, 603412902,
             601055502, 602893001, 601189902, 601850602, 600539902, 602507002, 601460902, 602511602, 601200902,
             601601802, 600946903, 600428502, 600953502, 601084802, 601350102, 600829602, 600174402]
    liste_men = np.unique(input_df.loc[input_df.idfoy.isin(liste), 'idmen'].values)
    print(liste_men)
    print(df.loc[df.ident.isin(liste_men), ['noi', 'noindiv', 'ident', 'declar1', 'declar2', 'persfip', 'persfipd', 'quelfic']].head(30).to_string())
    print(input_df.loc[input_df.idfoy.isin(liste), :].head(30).to_string())

    # The column listing below used to sit behind an unconditional return;
    # it is now available on request.
    if not show_columns:
        return

    print('    FAM')
    print(sorted(df_fam.columns))
    print('    FOY')
    print(sorted(df_foy.columns))
    print('    MEN')
    print(sorted(df_men.columns))
    print('    IND')
    print(sorted(df_ind.columns))

    print('    INPUT')
    print(sorted(input_df.columns))
Beispiel #8
0
def check_converted(show_columns=False):
    """
    Print diagnostics about the converted 2006 survey.

    Loads ``survey_2006`` from ``survey_test`` and the four entity tables
    from ``survey3_test``, prints the erf/eec rows of one household, the
    number of distinct ``idfoy`` and the rows attached to a fixed list of
    ``idfoy``.

    :param show_columns: when true, also print the sorted column names of
        the fam/foy/men/ind tables and of the input table.  The default
        stops before that listing, as the function always did.
    """
    # Retrieving the input and output files for analysis.  Each store is
    # closed as soon as its frames are in memory, even if a key is missing.
    store = HDFStore(survey_test)
    try:
        input_df = store['survey_2006']
    finally:
        store.close()

    output = HDFStore(survey3_test)
    try:
        df_fam = output['survey_2006/fam']
        df_foy = output['survey_2006/foy']
        df_men = output['survey_2006/men']
        df_ind = output['survey_2006/ind']
    finally:
        output.close()

    year = 2006
    erf = DataCollection(year=year)
    df = erf.get_of_values(table="erf_indivi")
    df2 = erf.get_of_values(table="eec_indivi")
    print('\n')
    print(df.loc[df.ident == 6030189, :].to_string())
    print(df2.loc[df2.ident == 6030189, :].to_string())

    print(len(np.unique(input_df['idfoy'].values)))
    print(len(np.unique(input_df.loc[input_df['quifoy'] == 0, 'idfoy'].values)))

    liste = [
        601228002, 602671302, 602016402, 603069602, 601365902, 602679402,
        602680905, 603074902, 600848302, 602684902, 601508802, 601427302,
        601774602, 600466102, 603448202, 603091202, 602437502, 603224003,
        603093102, 601261802, 601000002, 601789602, 601660602, 600350102,
        601927802, 601797902, 601667902, 601537502, 600227602, 602854502,
        602071902, 600144702, 602205702, 600769302, 601096602, 602609202,
        601301302, 602220302, 602486102, 601376802, 601570902, 600654802,
        601443202, 603412402, 603412902, 601055502, 602893001, 601189902,
        601850602, 600539902, 602507002, 601460902, 602511602, 601200902,
        601601802, 600946903, 600428502, 600953502, 601084802, 601350102,
        600829602, 600174402
    ]
    liste_men = np.unique(input_df.loc[input_df.idfoy.isin(liste),
                                       'idmen'].values)
    print(liste_men)
    print(df.loc[df.ident.isin(liste_men), [
        'noi', 'noindiv', 'ident', 'declar1', 'declar2', 'persfip', 'persfipd',
        'quelfic'
    ]].head(30).to_string())
    print(input_df.loc[input_df.idfoy.isin(liste), :].head(30).to_string())

    # The column listing below used to sit behind an unconditional return;
    # it is now available on request.
    if not show_columns:
        return

    print('    FAM')
    print(sorted(df_fam.columns))
    print('    FOY')
    print(sorted(df_foy.columns))
    print('    MEN')
    print(sorted(df_men.columns))
    print('    IND')
    print(sorted(df_ind.columns))

    print('    INPUT')
    print(sorted(input_df.columns))