Beispiel #1
0
def build_aggregates():
#    writer = None
    years = range(2009,2010)
    for year in years:
        yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year = yr)
        simu.set_param()
        simu.set_survey()
        simu.compute()
        variables = ["garext", "ci_garext", "inthab", "ppe_brute", "rni"]
        x = simu.aggregated_by_entity("men", variables, all_output_vars = False)
#        df = x[0]
#        print df["ci_garext"].describe()
        agg = Aggregates()
        agg.set_simulation(simu)
        agg.show_default = False
        agg.show_real = False
        agg.show_diff = False
        agg.varlist = var_list  # ERROR: var_list is undefined.
        agg.compute()
        cols = agg.aggr_frame.columns[:4]
        print agg.aggr_frame[cols].to_string()
#        if writer is None:
#            writer = ExcelWriter(str(fname_all))
#        agg.aggr_frame.to_excel(writer, yr, index= False, header= True)
        del simu
        del agg
        import gc
        gc.collect()
Beispiel #2
0
def get_common_dataframe(variables, year = 2006):
    """
    Compare variables in ERF and openfisca.

    Runs a SurveySimulation for *year*, fetches the same *variables* from the
    ERF "menage" table (suffixed ``_erf``), and returns both merged on the
    household identifier ``idmen``.
    """
    simulation = SurveySimulation()
    simulation.set_config(year = year)
    simulation.set_param()
    simulation.set_survey()
    simulation.compute()

    # BUG FIX: honour the ``year`` argument (was hard-coded to 2006).
    erf = ErfsDataTable(year=year)
    # "ident" (household id) and "wprm" (weights) are always needed.
    erf_variables = list(variables)
    if "ident" not in erf_variables:
        erf_variables.append("ident")
    if "wprm" not in erf_variables:
        erf_variables.append("wprm")

    erf_dataframe = erf.get_values(erf_variables, table="menage")
    erf_dataframe.rename(columns={'ident': 'idmen'}, inplace=True)
    for col in erf_dataframe.columns:
        # BUG FIX: ``is not`` compared object identity with a string
        # literal (implementation-dependent); use ``!=`` for value equality.
        if col != "idmen":
            erf_dataframe.rename(columns={col: col + "_erf"}, inplace=True)

    of_dataframe, of_dataframe_default = simulation.aggregated_by_entity("men", variables, all_output_vars=False, force_sum=True)
    del of_dataframe_default

    merged_df = of_dataframe.merge(erf_dataframe, on="idmen")
    del of_dataframe, erf_dataframe
    return merged_df
Beispiel #3
0
def test_chunk():
    print "debut"
    writer = None
    years = range(2011, 2012)
    filename = destination_dir + 'output3.h5'
    store = HDFStore(filename)
    for year in years:
        yr = str(year)
        #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        import time

        tps = {}
        for nb_chunk in range(1, 5):
            deb_chunk = time.clock()
            simu.set_config(survey_filename='C:\\Til\\output\\to_run_leg.h5',
                            num_table=3,
                            chunks_count=nb_chunk,
                            print_missing=False)
            simu.compute()
            tps[nb_chunk] = time.clock() - deb_chunk

            voir = simu.output_table.table3['foy']
            print len(voir)
            pdb.set_trace()
            agg3 = Aggregates()
            agg3.set_simulation(simu)
            agg3.compute()
            df1 = agg3.aggr_frame
            print df1.to_string()

    print tps
    store.close()
def convert_to_3_tables(year=2006, survey_file=None, output_file=None):

    if survey_file is None:
        raise Exception(
            'You need a .h5 file with the survey to extract the variables from'
        )
    if output_file is None:
        output_file = survey_file
        raise Warning(
            'the survey file will be used to store the created tables')

    store = HDFStore(survey_file)
    output = HDFStore(output_file)
    print output

    simulation = SurveySimulation()
    simulation.set_config(year=year)
    table1 = store['survey_' + str(year)]

    for entity in ['ind', 'foy', 'men', 'fam']:
        key = 'survey_' + str(year) + '/' + str(entity)

        vars_matching_entity = vars_matching_entity_from_table(
            table1, simulation, entity)
        print entity, vars_matching_entity_from_table
        print 'table1 enum'

        if entity == 'ind':
            print 'INDIVIDUALS'
            print table1['noindiv']
            table_entity = table1.loc[:, vars_matching_entity]

        # we take care have all ident and selecting qui==0
        else:
            #             print '    entity :', entity
            #             print table1['noindiv'].head()
            position = 'qui' + entity
            #             print table1[position]
            table_entity = table1.ix[table1[position] == 0, [
                'noi', 'idmen', 'idfoy', 'idfam', 'quifoy', 'quimen', 'quifam'
            ] + vars_matching_entity]
            #             print table_entity.noi.head()
            table_entity = table_entity.rename_axis(table_entity['id' +
                                                                 entity],
                                                    axis=1)


#             print '    APRES'
#             print table_entity.noi.head()
        print key
        output.put(key, table_entity)

    del table1
    import gc
    gc.collect()

    store.close()
    output.close()
Beispiel #5
0
def test_chunk():
    print "debut"
    writer = None
    years = range(2011,2012)
    filename = destination_dir+'output3.h5'
    store = HDFStore(filename)
    for year in years:
        yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year = yr)
        simu.set_param()
        import time

        tps = {}
        for nb_chunk in range(1,5):
            deb_chunk = time.clock()
            simu.set_config(survey_filename='C:\\Til\\output\\to_run_leg.h5', num_table=3, chunks_count=nb_chunk ,
                            print_missing=False)
            simu.compute()
            tps[nb_chunk] = time.clock() - deb_chunk

            voir = simu.output_table.table3['foy']
            print len(voir)
            pdb.set_trace()
            agg3 = Aggregates()
            agg3.set_simulation(simu)
            agg3.compute()
            df1 = agg3.aggr_frame
            print df1.to_string()

    print tps
    store.close()
Beispiel #6
0
def test():
    yr = 2006
    simu = SurveySimulation()
    simu.set_config(year=yr)
    simu.set_param()
    filename = os.path.join(model.DATA_DIR, 'survey_psl.h5')
    simu.set_survey(filename=filename)
    simu.compute()

    df = get_structure(simu, 'br_al')
    print df.to_string()
Beispiel #7
0
def toto():
    year = 2006

    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()
    simulation.set_survey()
    simulation.compute()

    for name, col in simulation.output_table.column_by_name.iteritems():
        print col.name
        print col._dtype
        print col.entity
Beispiel #8
0
def survey_case(year):

    #        fname = "Agg_%s.%s" %(str(yr), "xls")
    simulation = SurveySimulation()
    simulation.set_config(year=year, num_table=1, reforme=True)
    simulation.set_param()
    simulation.P.ir.autre.charge_loyer.plaf = 500
    simulation.P.ir.autre.charge_loyer.active = 1
    simulation.P.ir.autre.charge_loyer.plaf_nbp = 0

    # plaf=1000 plaf_nbp=0: -42160, =1: -41292
    # plaf=500  plaf_nbp=0: -43033, =1: -42292

    # Bareme threshold reduction in pct
    reduc = .1
    print simulation.P.ir.bareme
    print simulation.P.ir.bareme.nb
    for i in range(2, simulation.P.ir.bareme.nb):
        simulation.P.ir.bareme.setSeuil(
            i, simulation.P.ir.bareme.seuils[i] * (1 - reduc))

    print simulation.P.ir.bareme
    print simulation.P.ir.bareme.nb
    simulation.compute()

    # Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()

    df1 = agg.aggr_frame
    print df1.to_string()
    return
def test():
    country = "france"
    for year in range(2006,2010):

        yr = str(year)
        simu = SurveySimulation()
        simu.set_config(year = yr, country = country)
        simu.set_param()

        agg = Aggregates()
        agg.set_simulation(simu)

        for col in agg.varlist:
            print col
def convert_to_3_tables(year=2006, survey_file=None, output_file=None):

    if survey_file is None:
        raise Exception('You need a .h5 file with the survey to extract the variables from')
    if output_file is None:
        output_file = survey_file
        raise Warning('the survey file will be used to store the created tables')

    store = HDFStore(survey_file)
    output = HDFStore(output_file)
    print output

    simulation = SurveySimulation()
    simulation.set_config(year=year)
    table1 = store['survey_'+str(year)]

    for entity in ['ind','foy','men','fam']:
        key = 'survey_'+str(year) + '/'+str(entity)

        vars_matching_entity = vars_matching_entity_from_table(table1, simulation, entity)
        print entity, vars_matching_entity_from_table
        print 'table1 enum'

        if entity == 'ind':
            print 'INDIVIDUALS'
            print table1['noindiv']
            table_entity = table1.loc[:, vars_matching_entity]

        # we take care have all ident and selecting qui==0
        else:
#             print '    entity :', entity
#             print table1['noindiv'].head()
            position = 'qui'+entity
#             print table1[position]
            table_entity = table1.ix[table1[position] == 0 ,['noi','idmen','idfoy','idfam','quifoy','quimen','quifam'] +
                                                        vars_matching_entity]
#             print table_entity.noi.head()
            table_entity= table_entity.rename_axis(table_entity['id'+entity], axis=1)
#             print '    APRES'
#             print table_entity.noi.head()
        print key
        output.put(key, table_entity)

    del table1
    import gc
    gc.collect()

    store.close()
    output.close()
Beispiel #11
0
def test():
    yr = 2006
    simu = SurveySimulation()
    simu.set_config(year = yr)
    simu.set_param()
    filename = os.path.join(model.DATA_DIR, 'survey_psl.h5')
    simu.set_survey(filename = filename)
    simu.compute()

    df = get_structure(simu, 'br_al')
    print df.to_string()
Beispiel #12
0
def toto():
    year = 2006

    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()
    simulation.set_survey()
    simulation.compute()

    for name, col in simulation.output_table.column_by_name.iteritems():
        print col.name
        print col._dtype
        print col.entity
Beispiel #13
0
def test3():
    """Check consistency between the ERF eec_menage table and the input
    table of a 2006 survey simulation."""
    year = 2006
    collection = DataCollection(year=year)
    eec_df = collection.get_of_values(table="eec_menage")
    from openfisca_core.simulations import SurveySimulation
    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()
    simulation.compute()  # TODO: this should not be mandatory
    check_consistency(simulation.input_table, eec_df)
Beispiel #14
0
def survey_case(year):
#        fname = "Agg_%s.%s" %(str(yr), "xls")
    simulation = SurveySimulation()
    simulation.set_config(year = year)
    simulation.set_param()

#    Ignore this
#    inflator = get_loyer_inflator(year)
#    simulation.inflate_survey({'loyer' : inflator})

    simulation.compute()


# Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()

    df1 = agg.aggr_frame
    print df1.to_string()

#    Saving aggregates
#    if writer is None:
#        writer = ExcelWriter(str(fname)
#    agg.aggr_frame.to_excel(writer, yr, index= False, header= True)


# Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by = 'so', vars = ['nivvie'])
    print df2.to_string()
Beispiel #15
0
def build_from_openfisca( directory = None):

    df_age_final = None
    for yr in range(2006,2010):
        simulation = SurveySimulation()
        simulation.set_config(year = yr)
        simulation.set_param()
        simulation.set_survey()


        df_age = get_age_structure(simulation)
        df_age[yr] = df_age['wprm']
        del df_age['wprm']
        if df_age_final is None:
            df_age_final = df_age
        else:
            df_age_final = df_age_final.merge(df_age)

    if directory is None:
        directory = os.path.dirname(__file__)

    fname = os.path.join(directory, H5_FILENAME)
    store = HDFStore(fname)
    print df_age_final.dtypes
    store.put("openfisca", df_age_final)
    store.close()
def survey_case(year):

#        fname = "Agg_%s.%s" %(str(yr), "xls")
    simulation = SurveySimulation()
    simulation.set_config(year = year, num_table = 1, reforme = True)
    simulation.set_param()
    simulation.P.ir.autre.charge_loyer.plaf = 500
    simulation.P.ir.autre.charge_loyer.active = 1
    simulation.P.ir.autre.charge_loyer.plaf_nbp = 0

    # plaf=1000 plaf_nbp=0: -42160, =1: -41292
    # plaf=500  plaf_nbp=0: -43033, =1: -42292

    # Bareme threshold reduction in pct
    reduc = .1
    print simulation.P.ir.bareme
    print len(simulation.P.ir.bareme.thresholds)
    for i in range(2, len(simulation.P.ir.bareme.thresholds)):
        simulation.P.ir.bareme.setSeuil(i, simulation.P.ir.bareme.thresholds[i] * (1 - reduc))

    print simulation.P.ir.bareme
    print len(simulation.P.ir.bareme.thresholds)
    simulation.compute()

# Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()

    df1 = agg.aggr_frame
    print df1.to_string()
    return
Beispiel #17
0
def survey_case(year):

    yr = str(year)
    #        fname = "Agg_%s.%s" %(str(yr), "xls")
    simulation = SurveySimulation()
    simulation.set_config(year=yr, num_table=1)
    simulation.set_param()

    simulation.compute()

    df = simulation.get_variables_dataframe(variables=["rsa_act"],
                                            entity='ind')
    print df["rsa_act"].describe()

    del simulation
    import gc
    gc.collect()
Beispiel #18
0
def test3():
    """Check consistency between the ERF eec_menage table and the input
    table of a 2006 survey simulation."""
    year = 2006
    collection = DataCollection(year=year)
    eec_df = collection.get_of_values(table="eec_menage")
    from openfisca_core.simulations import SurveySimulation
    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()
    simulation.compute()  # TODO: this should not be mandatory
    check_consistency(simulation.input_table, eec_df)
def test_gini():
    """
    Compute Gini coefficients
    """
    years = range(2006, 2010)
    for year in years:
        yr = str(year)
        #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        inflator = get_loyer_inflator(year)
        simu.inflate_survey({'loyer': inflator})
        simu.compute()

        inequality = Inequality()
        inequality.set_simulation(simu)
        inequality.compute()
        print inequality.inequality_dataframe
        print inequality.poverty
        del simu
Beispiel #20
0
def test():

    for year in range(2006, 2010):
        yr = str(year)
        #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        simu.set_survey()
        for var in ["f4ga", "f4gb", "f4gc", "f4ge", "f4gf", "f4gg"]:
            print var
            df = simu.survey.get_value(var)
            print df.max()
            print df.min()
def build_erf_aggregates():
    """
    Fetch the relevant aggregates from erf data
    """
#    Uses rpy2.
#    On MS Windows, The environment variable R_HOME and R_USER should be set
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)

    country = 'france'
    for year in range(2006,2008):
        menageXX = "menage" + str(year)[2:]
        menageRdata = menageXX + ".Rdata"
        filename = os.path.join(os.path.dirname(DATA_DIR),'R','erf', str(year), menageRdata)
        yr = str(year)
        simu = SurveySimulation()
        simu.set_config(year = yr, country = country)
        simu.set_param()

        agg = Aggregates()
        agg.set_simulation(simu)
        # print agg.varlist
        rpy.r.load(filename)

        menage = com.load_data(menageXX)
        cols = []
        print year
        for col in agg.varlist:
            #print col
            erf_var = "m_" + col + "m"
            if erf_var in menage.columns:
                cols += [erf_var]

        df = menage[cols]
        wprm = menage["wprm"]
        for col in df.columns:

            tot = (df[col]*wprm).sum()/1e9
            print col, tot
Beispiel #22
0
def survey_case(year):

    yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
    simulation = SurveySimulation()
    simulation.set_config(year = yr, num_table=1)
    simulation.set_param()

    simulation.compute()

    df = simulation.get_variables_dataframe( variables=["rsa_act"], entity='ind')
    print df["rsa_act"].describe()

    del simulation
    import gc
    gc.collect()
def survey_case(year=2006):
    yr = str(year)
    #        fname = "Agg_%s.%s" %(str(yr), "xls")
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=yr, survey_filename=survey_filename)
    simulation.set_param()

    #    Ignore this
    #    inflator = get_loyer_inflator(year)
    #    simulation.inflate_survey({'loyer' : inflator})

    simulation.compute()
    simul_out_df = simulation.output_table.table
    simul_in_df = simulation.input_table.table
    print simul_out_df.loc[:,
                           ['af', 'af_base', 'af_forf', 'af_majo', 'af_nbenf'
                            ]].describe()
    print 'input vars'
    print simul_in_df.columns
    print 'output vars'
    print simul_out_df.columns

    #     check_inputs_enumcols(simulation)

    # Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()
    df1 = agg.aggr_frame
    print df1.columns

    print df1.to_string()

    #    Saving aggregates
    #    if writer is None:
    #        writer = ExcelWriter(str(fname)
    #    agg.aggr_frame.to_excel(writer, yr, index= False, header= True)

    # Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by='so', vars=['nivvie'])
    print df2.to_string()
    return df1
def test_gini():
    """
    Compute Gini coefficients
    """
    years = range(2006, 2010)
    for year in years:
        yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year = yr)
        simu.set_param()
        inflator = get_loyer_inflator(year)
        simu.inflate_survey({'loyer' : inflator})
        simu.compute()

        inequality = Inequality()
        inequality.set_simulation(simu)
        inequality.compute()
        print inequality.inequality_dataframe
        print inequality.poverty
        del simu
Beispiel #25
0
def test():

    for year in range(2006,2010):
        yr = str(year)
    #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year = yr)
        simu.set_param()
        simu.set_survey()
        for var in ["f4ga", "f4gb", "f4gc", "f4ge", "f4gf", "f4gg"]:
            print var
            df = simu.survey.get_value(var)
            print df.max()
            print df.min()
Beispiel #26
0
def survey_case_3_tables():
    year = 2006
    yr = str(year)
    simulation = SurveySimulation()

    survey_input = HDFStore(survey3_test)
    #     convert_to_3_tables(year=year, survey_file=survey_file, output_file=survey3_file)
    df_men = survey_input['survey_2006/men']
    df_foy = survey_input['survey_2006/foy']
    df_fam = survey_input['survey_2006/fam']
    df_fam['alr'] = 0
    survey_input['survey_2006/fam'] = df_fam

    simulation.num_table = 3

    simulation.set_config(year=yr, survey_filename=survey3_test)
    simulation.set_param()

    simulation.compute()

    # Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()

    df1 = agg.aggr_frame
    print df1.to_string()

    # #    Saving aggregates
    #     if writer is None:
    #         writer = ExcelWriter(str(fname)
    #     agg.aggr_frame.to_excel(writer, yr, index= False, header= True)

    # Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by='so', vars=['nivvie'])
    print df2.to_string()
Beispiel #27
0
    def _generate_aggregates(self):
        """Build per-year aggregate frames from OpenFisca and from the ERFS.

        For every year in ``self.years`` this runs a SurveySimulation on
        ``self.survey_filename``, appends the OpenFisca aggregates frame to
        ``self.aggregates_of_dataframe`` and the matching (transposed) ERFS
        aggregates to ``self.aggregates_erfs_dataframe``; the human-readable
        labels of the aggregated variables end up in
        ``self.labels_variables``.
        """
        dfs = list()
        dfs_erf = list()
        # ROBUSTNESS FIX: initialise before the loop so an empty
        # ``self.years`` no longer raises NameError at the final assignment.
        labels_variables = list()
        years = self.years
        for year in years:
            # Running a standard SurveySimulation to get OF aggregates
            simulation = SurveySimulation()
            survey_filename = self.survey_filename
            simulation.set_config(year=year, survey_filename=survey_filename)
            simulation.set_param()
            simulation.compute()
            agg = Aggregates()
            agg.set_simulation(simulation)
            agg.compute()
            df = agg.aggr_frame
            df['year'] = year
            # Map variable names to display labels for the ERFS columns.
            label_by_name = dict(
                (name, column.label) for name, column in
                simulation.output_table.column_by_name.iteritems())
            dfs.append(df)
            variables = agg.varlist
            labels_variables = [
                label_by_name[variable] for variable in variables
            ]
            # Free the heavy objects before building the ERFS counterpart.
            del simulation, agg, df

            gc.collect()

            # ERFS, transposed so years stack as rows across iterations.
            temp = (build_erf_aggregates(variables=variables, year=year))
            temp.rename(columns=label_by_name, inplace=True)
            temp = temp.T
            temp.reset_index(inplace=True)
            temp['year'] = year
            dfs_erf.append(temp)
            del temp
            gc.collect()

        self.labels_variables = labels_variables
        self.aggregates_of_dataframe = dfs
        self.aggregates_erfs_dataframe = dfs_erf
Beispiel #28
0
def survey_case_3_tables():
    year = 2006
    yr = str(year)
    simulation = SurveySimulation()


    survey_input = HDFStore(survey3_test)
#     convert_to_3_tables(year=year, survey_file=survey_file, output_file=survey3_file)
    df_men = survey_input['survey_2006/men']
    df_foy = survey_input['survey_2006/foy']
    df_fam = survey_input['survey_2006/fam']
    df_fam['alr'] = 0;
    survey_input['survey_2006/fam'] = df_fam

    simulation.num_table = 3

    simulation.set_config(year = yr, survey_filename=survey3_test)
    simulation.set_param()

    simulation.compute()

# Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()

    df1 = agg.aggr_frame
    print df1.to_string()

# #    Saving aggregates
#     if writer is None:
#         writer = ExcelWriter(str(fname)
#     agg.aggr_frame.to_excel(writer, yr, index= False, header= True)


# Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by ='so', vars=['nivvie'])
    print df2.to_string()
Beispiel #29
0
def survey_case(year = 2006):
    yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=yr, survey_filename=survey_filename)
    simulation.set_param()


#    Ignore this
#    inflator = get_loyer_inflator(year)
#    simulation.inflate_survey({'loyer' : inflator})

    simulation.compute()
    simul_out_df = simulation.output_table.table
    simul_in_df = simulation.input_table.table
    print simul_out_df.loc[:,['af', 'af_base', 'af_forf', 'af_majo', 'af_nbenf']].describe()
    print 'input vars'
    print simul_in_df.columns
    print 'output vars'
    print simul_out_df.columns

#     check_inputs_enumcols(simulation)

# Compute aggregates
    agg = Aggregates()
    agg.set_simulation(simulation)
    agg.compute()
    df1 = agg.aggr_frame
    print df1.columns

    print df1.to_string()

#    Saving aggregates
#    if writer is None:
#        writer = ExcelWriter(str(fname)
#    agg.aggr_frame.to_excel(writer, yr, index= False, header= True)


# Displaying a pivot table
    from openfisca_qt.plugins.survey.distribution import OpenfiscaPivotTable
    pivot_table = OpenfiscaPivotTable()
    pivot_table.set_simulation(simulation)
    df2 = pivot_table.get_table(by ='so', vars=['nivvie'])
    print df2.to_string()
    return df1
Beispiel #30
0
    def _generate_aggregates(self):
        """Build per-year aggregate frames from OpenFisca and from the ERFS.

        For every year in ``self.years`` this runs a SurveySimulation on
        ``self.survey_filename``, appends the OpenFisca aggregates frame to
        ``self.aggregates_of_dataframe`` and the matching (transposed) ERFS
        aggregates to ``self.aggregates_erfs_dataframe``; the human-readable
        labels of the aggregated variables end up in
        ``self.labels_variables``.
        """
        dfs = list()
        dfs_erf = list()
        # ROBUSTNESS FIX: initialise before the loop so an empty
        # ``self.years`` no longer raises NameError at the final assignment.
        labels_variables = list()
        years = self.years
        for year in years:
            # Running a standard SurveySimulation to get OF aggregates
            simulation = SurveySimulation()
            survey_filename = self.survey_filename
            simulation.set_config(year=year, survey_filename=survey_filename)
            simulation.set_param()
            simulation.compute()
            agg = Aggregates()
            agg.set_simulation(simulation)
            agg.compute()
            df = agg.aggr_frame
            df['year'] = year
            # Map variable names to display labels for the ERFS columns.
            label_by_name = dict(
                (name, column.label)
                for name, column in simulation.output_table.column_by_name.iteritems()
                )
            dfs.append(df)
            variables = agg.varlist
            labels_variables = [
                label_by_name[variable]
                for variable in variables
                ]
            # Free the heavy objects before building the ERFS counterpart.
            del simulation, agg, df

            gc.collect()

            # ERFS, transposed so years stack as rows across iterations.
            temp = (build_erf_aggregates(variables=variables, year=year))
            temp.rename(columns=label_by_name, inplace=True)
            temp = temp.T
            temp.reset_index(inplace=True)
            temp['year'] = year
            dfs_erf.append(temp)
            del temp
            gc.collect()

        self.labels_variables = labels_variables
        self.aggregates_of_dataframe = dfs
        self.aggregates_erfs_dataframe = dfs_erf
def test():
    country = "france"
    for year in range(2006, 2010):

        yr = str(year)
        simu = SurveySimulation()
        simu.set_config(year=yr, country=country)
        simu.set_param()

        agg = Aggregates()
        agg.set_simulation(simu)

        for col in agg.varlist:
            print col
Beispiel #32
0
def build_erf_aggregates():
    """
    Fetch the relevant aggregates from erf data
    """
    #    Uses rpy2.
    #    On MS Windows, The environment variable R_HOME and R_USER should be set
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)

    country = 'france'
    for year in range(2006, 2008):
        menageXX = "menage" + str(year)[2:]
        menageRdata = menageXX + ".Rdata"
        filename = os.path.join(os.path.dirname(DATA_DIR), 'R', 'erf',
                                str(year), menageRdata)
        yr = str(year)
        simu = SurveySimulation()
        simu.set_config(year=yr, country=country)
        simu.set_param()

        agg = Aggregates()
        agg.set_simulation(simu)
        # print agg.varlist
        rpy.r.load(filename)

        menage = com.load_data(menageXX)
        cols = []
        print year
        for col in agg.varlist:
            #print col
            erf_var = "m_" + col + "m"
            if erf_var in menage.columns:
                cols += [erf_var]

        df = menage[cols]
        wprm = menage["wprm"]
        for col in df.columns:

            tot = (df[col] * wprm).sum() / 1e9
            print col, tot
Beispiel #33
0
def test(year=2006, variables=['af']):
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

    #     of_aggregates = Aggregates()
    #     of_aggregates.set_simulation(simulation)
    #     of_aggregates.compute()
    #     print of_aggregates.aggr_frame
    #
    #     from openfisca_france.data.erf.aggregates import build_erf_aggregates
    #     temp = (build_erf_aggregates(variables=variables, year= year))
    #     print temp
    #     return
    variable = "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()

    def get_all_ancestors(varlist):
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]] + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]] +
                        get_all_ancestors(list(varlist[0]._parents)) +
                        get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(lambda x: simulation.output_table.column_by_name.get(x),
                  variables)
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in variables:
        children = set()
        varcol = simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        variables = [variable]
        return simulation.aggregated_by_entity(entity="men",
                                               variables=variables,
                                               all_output_vars=False,
                                               force_sum=True)[0]

    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]],
                                                  on='idmen',
                                                  how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo
    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom
    et parfois sont les variables utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables principales ( au moins les aggrégats
    que l'on compare )
    Si les variables ne sont pas directement dans la table,
    elles ont été calculées à partir d'autres variables de données erf/eec
    donc chercher dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''

    fetch_eec = [
        'statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia',
        'noindiv'
    ]
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                        table="erf_menage")

    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:

    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table[
        'noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table[
        'noindiv'].astype(np.int64)
    s1 = [
        var for var in set(options).intersection(
            set(simulation.output_table.table.columns))
    ] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={
        'idmen_ind': 'idmen',
        'quimen_ind': 'quimen'
    },
                               inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [
            var for var in (set(options).intersection(
                set(simulation.input_table.table.columns)) - set(s1))
        ] + ['noindiv']
        #print s2
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp,
                                                        on='noindiv',
                                                        how='inner',
                                                        sort=False)

        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) +
                                              ['idmen', 'quimen', 'noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)

    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(
    ), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(
    ), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" % str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" % str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()

    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(
        set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    for col in colcom:
        temp = set(
            erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" % (
            col, str(len(temp)))
        del temp

    # Detect the biggest differences
    bigtable = merge(erf_menage,
                     simu_aggr_tables,
                     on='idmen',
                     how='inner',
                     suffixes=('_erf', '_of'))
    print 'Length of new dataframe is %s' % str(len(bigtable))
    #print bigtable.columns
    bigtable.set_index('idmen', drop=False, inplace=True)

    already_met = []
    options_met = []

    for col in colcom:
        bigtemp = None
        table = bigtable[and_(bigtable[col + '_erf'] != 0,
                              bigtable[col + '_of'] != 0)]
        table[col] = (table[col + '_erf'] - table[col + '_of']
                      ) / table[col + '_erf']  #Difference relative
        table[col] = table[col].apply(lambda x: abs(x))
        print 'Minimum difference between the two tables for %s is %s' % (
            col, str(table[col].min()))
        print 'Maximum difference between the two tables for %s is %s' % (
            col, str(table[col].max()))
        print table[col].describe()
        try:
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col],
                              np.arange(1, 11),
                              table['wprm_of'],
                              2,
                              return_quantiles=True)
            #print sorted(values)
            dec, values = mwp(table[col],
                              np.arange(1, 101),
                              table['wprm_erf'],
                              2,
                              return_quantiles=True)
            #print sorted(values)[90:]
            del dec, values
            gc.collect()
        except:
            #print 'Weighted percentile method didnt work for %s' %col
            pass
        print "\n"

        # Show the relevant information for the most deviant households
        table.sort(columns=col, ascending=False, inplace=True)
        #print table[col][0:10].to_string()
        if bigtemp is None:
            bigtemp = {
                'table': table[[col, col + '_of', col + '_erf',
                                'idmen']][0:10],
                'options': None
            }
        bigtemp['table'][col + 'div'] = bigtemp['table'][
            col + '_of'] / bigtemp['table'][col + '_erf']
        print bigtemp['table'].to_string()
        '''
        bigtemp is the table which will get filled little by little by the relevant variables.
        Up to the last rows of code 'table' refers to a table of aggregated values,
        while 'options is a table of individual variables.
        The reason we call it in a dictionnary is also because we modify it inside the recursive function 'iter_on parents',
        and it causes an error in Python unless for certain types like dictionnary values.
        '''
        #print "\n"

        # If variable is a Prestation, we show the dependancies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):
            '''
            For the direct children
            '''
            if not varcol._children is None:
                ch_to_fetch = list(varcol._children)
                ch_to_fetch = map(lambda x: x.name, ch_to_fetch)
                ch_fetched = []

                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print "Variables which need %s to be computed :\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(simu_aggr_tables.columns).union(
                        erf_menage.columns):
                    print "Variables which need %s to be computed (some missing picked in erf):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                            ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print "Variables which need %s to be computed (some missing):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:

                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')

                print table[[col] + ch_fetched][0:10]
                print "\n"
                del ch_to_fetch, ch_fetched
            '''
            For the parents
            '''
            def iter_on_parents(varcol):
                if (varcol._parents == set() and varcol._option
                        == {}) or varcol.name in already_met:
                    return
                else:
                    par_to_fetch = list(varcol._parents)
                    par_to_fetch = map(lambda x: x.name, par_to_fetch)
                    par_fetched = []

                    if set(par_fetched) <= set(simu_aggr_tables.columns):
                        #print "Variables the prestation %s depends of :\n %s \n" %(varcol.name, str(par_fetched))
                        for var in par_fetched:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                            else:
                                par_fetched.append(var)
                    elif set(par_fetched) <= set(
                            simu_aggr_tables.columns).union(
                                erf_menage.columns):
                        #print "Variables the prestation %s depends of (some missing picked in erf):\n %s \n" %(varcol.name,str(par_fetched))
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var + '_erf' in table.columns:
                                par_fetched.append(var + '_erf')
                            else:
                                par_fetched.append(var)
                    else:
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var in erf_menage.columns:
                                if var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                        if len(par_fetched) > 0:
                            #print "Variables the prestation %s depends of (some missing):\n %s \n" %(varcol.name, str(par_fetched))
                            pass
                        else:
                            #print "Variables the prestation %s depends of couldn't be found :\n %s \n" %(varcol.name, str(par_fetched))
                            pass

                    if len(par_fetched) > 0:
                        temp = table[[col, 'idmen'] + par_fetched][0:10]
                        bigtemp['table'] = pd.merge(temp,
                                                    bigtemp['table'],
                                                    how='inner')
                        #print temp.to_string(), "\n"
                    if varcol._option != {} and not set(
                            varcol._option.keys()) < set(options_met):
                        vars_to_fetch = list(
                            set(varcol._option.keys()) - set(options_met))
                        #print "and the options to current variable %s for the id's with strongest difference :\n %s \n" %(varcol.name, varcol._option.keys())
                        liste = [i for i in range(0, 10)]
                        liste = map(lambda x: table['idmen'].iloc[x], liste)
                        temp = simu_nonaggr_tables[
                            ['idmen', 'quimen', 'noindiv'] +
                            vars_to_fetch][simu_nonaggr_tables['idmen'].isin(
                                table['idmen'][0:10])]

                        temp_sorted = temp[temp['idmen'] == liste[0]]
                        for i in xrange(1, 10):
                            temp_sorted = temp_sorted.append(
                                temp[temp['idmen'] == liste[i]])
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(
                                erf_eec_indivi, on='noindiv', how='outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(
                                temp_sorted,
                                on=['noindiv', 'idmen', 'quimen'],
                                how='outer')


#                         temp_sorted.set_index(['idmen',  'quimen'], drop = True, inplace = True) # If we do that
                        del temp, temp_sorted
                        gc.collect()

                    already_met.append(varcol.name)
                    options_met.extend(varcol._option.keys())
                    for var in varcol._parents:
                        iter_on_parents(var)

            iter_on_parents(varcol)
            # We merge the aggregate table with the option table ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                      how='left',
                                                      on='idmen',
                                                      suffixes=('(agg)',
                                                                '(ind)'))

            # Reshaping the table to group by descending error on col, common entities
            bigtemp['table'].sort(columns=['af', 'quimen'],
                                  ascending=[False, True],
                                  inplace=True)
            bigtemp['table'] = bigtemp['table'].groupby(['idmen', 'quimen'],
                                                        sort=False).sum()
            print "Table of values for %s dependencies : \n" % col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()
Beispiel #34
0
            print "Table of values for %s dependencies : \n" % col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()

if __name__ == '__main__':

    # Debug the 'af' variable on a saved simulation.  With restart=True the
    # simulation is recomputed from the survey file and saved under
    # saved_simulation_filename; the Debugger then reloads it by name.
    restart = True
    survey = 'survey.h5'
    save_path = os.path.join(model.DATA_DIR, 'erf')
    saved_simulation_filename = os.path.join(save_path,
                                             'debugger_' + survey[:-3])

    if restart:
        year = 2006
        simulation = SurveySimulation()
        # The main survey file lives at the root of DATA_DIR; any other
        # survey file is looked up under DATA_DIR/sources/.
        if survey == 'survey.h5':
            survey_filename = os.path.join(model.DATA_DIR, survey)
        else:
            survey_filename = os.path.join(model.DATA_DIR, 'sources', survey)

        simulation.set_config(year=year, survey_filename=survey_filename)
        simulation.set_param()
        simulation.compute()
        simulation.save_content('debug', saved_simulation_filename)

    deb = Debugger()
    deb.set_simulation(name='debug', filename=saved_simulation_filename)
    deb.set_variable('af')
    deb.show_aggregates()
    deb.preproc()
from openfisca_core import model
from openfisca_core.simulations import SurveySimulation


# Script: copy the survey tables into a PyTables file, one group per year and
# (in the 3-table layout) one table per entity.
filename = os.path.join(model.DATA_DIR, 'survey3.h5')

num_table = 3


# NOTE(review): HDFStore, tables and destination_dir are not defined in this
# chunk -- presumably `from pandas import HDFStore`, `import tables` and a
# destination_dir constant are provided elsewhere; confirm before running.
# NOTE(review): `input` shadows the builtin of the same name and is never used
# below.
input = HDFStore(filename)
survey = tables.openFile(destination_dir+"survey3.h5", mode = "w")

years = ['2006']

for yr in years:
    simu = SurveySimulation()
    simu.set_config(year = yr)
    simu.set_param()
    simu.set_survey(num_table=num_table)
    # One group per survey year.
    survey_year = survey.createGroup("/", "survey_"+yr,"year")
    if num_table == 3:
        # Write one table per entity from the simulation's 3-table survey.
        for ent in ['ind','men','foy','fam']:
            tab = simu.survey.table3[ent]
            tab_type = tab.to_records(index=False).dtype
            survey_table = survey.createTable('/survey_'+yr,ent,tab_type)
            survey_table.append(tab.to_records(index=False))
            survey_table.flush()
    if num_table == 1:
        # NOTE(review): this branch looks truncated -- to_remote is built but
        # never used and no table is written.
        tab = simu.survey.table
        tab_type = tab.to_records(index=False).dtype
        to_remote = ['opt_colca','quelfic']
def test():
    from openfisca_core.simulations import SurveySimulation
    from .aggregates import Aggregates

    yr = 2006
    simulation = SurveySimulation()
    simulation.set_config(year = yr)
    simulation.set_param()
    simulation.set_survey()

    calibration = Calibration()
    calibration.set_simulation(simulation)

    filename = "../../countries/france/calibrations/calib_2006.csv"
    calibration.set_inputs_margins_from_file(filename, 2006)

    calibration.set_param('invlo', 3)
    calibration.set_param('up', 3)
    calibration.set_param('method', 'logit')

    aggregates = Aggregates()
    aggregates.set_simulation(simulation)
    simulation.compute()
    aggregates.compute()

    print aggregates.aggr_frame.to_string()

    calibration.set_calibrated_weights()
    simulation.compute()
    aggregates.compute()

    print aggregates.aggr_frame.to_string()
def check_survey(year=2013):
    """Smoke test: configure and run a survey simulation for *year*."""
    simu = SurveySimulation()
    simu.set_config(year=year)
    simu.set_param()
    simu.compute()
Beispiel #38
0

import nose

from openfisca_core.simulations import SurveySimulation
from openfisca_qt.scripts.validation.check_consistency_tests import check_inputs_enumcols, check_entities, check_weights


# Validation
# Should ideally produce a log file
# Try to be the most country/model
# agnostic (so part of the general stuff could be elsewhere
# Proceed using import from separate file in validation

# Module-level fixture shared by the validation tests below: a survey
# simulation configured for 2006 (survey loaded, parameters set, not yet
# computed at import time).
year = 2006
simulation = SurveySimulation()
simulation.set_config(year = year)
simulation.set_param()
simulation.set_survey()


# Pre-computation validation
#


def test_inputs_consistency():
    """
    Test consistency of inputs data

    Placeholder: no checks are implemented yet.  The note below suggests
    validating the Enumcol inputs (see check_inputs_enumcols).
    """

#    check that the Enumcols are right (and fix the labels/the original data)
Beispiel #39
0
def test():
    from openfisca_core.simulations import SurveySimulation
    from .aggregates import Aggregates

    yr = 2006
    simulation = SurveySimulation()
    simulation.set_config(year = yr)
    simulation.set_param()
    simulation.set_survey()

    calibration = Calibration()
    calibration.set_simulation(simulation)

    filename = "../../countries/france/calibrations/calib_2006.csv"
    calibration.set_inputs_margins_from_file(filename, 2006)

    calibration.set_param('invlo', 3)
    calibration.set_param('up', 3)
    calibration.set_param('method', 'logit')

    aggregates = Aggregates()
    aggregates.set_simulation(simulation)
    simulation.compute()
    aggregates.compute()

    print aggregates.aggr_frame.to_string()

    calibration.set_calibrated_weights()
    simulation.compute()
    aggregates.compute()

    print aggregates.aggr_frame.to_string()
def test_laurence():
    '''
    Computes the openfisca/real numbers comparison table in an Excel worksheet.

    For each year 2006-2009, collects the openfisca aggregates and the ERF
    aggregates, reshapes them into one (Mesure, source) x year table and
    saves it with save_as_xls.

    Warning: To add more years you'll have to tweak the code manually.
    Default is years 2006 to 2009 included.
    '''
    def save_as_xls(df, alter_method=True):
        # Saves a datatable under Excel table using XLtable
        if alter_method:
            filename = "C:\desindexation.xls"
            print filename
            writer = ExcelWriter(str(filename))
            df.to_excel(writer)
            writer.save()
        else:
            # XLtable is only useful for colouring/styling the table; pointless otherwise
            stxl = XLtable(df)
            # <========== HERE TO CHANGE OVERLAY ======>
            wb = xlwt.Workbook()
            ws = wb.add_sheet('resultatstest')
            erfxcel = stxl.place_table(ws)
            try:  # I dunno more clever commands
                wb.save("C:\outputtest.xls")
            except:
                # Fall back to a randomized name if the file is locked/in use.
                n = random.randint(0, 100)
                wb.save("C:\outputtest_" + str(n) + ".xls")

#===============================================================================
#     from numpy.random import randn
#     mesures = ['cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add',
#                'cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add',
#                'cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add']
#     sources = ['of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
#                'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
#                'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel']
#     year = ['2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006',
#             '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007',
#             '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008']
#     ind = zip(*[mesures,sources, year])
# #     print ind
#     from pandas.core.index import MultiIndex
#     ind = MultiIndex.from_tuples(ind, names = ['mesure', 'source', 'year'])
# #     print ind
#     d = pd.DataFrame(randn(27,2), columns = ['Depenses', 'Recettes'], index = ind)
#     d.reset_index(inplace = True, drop = False)
#     d = d.groupby(by = ['mesure', 'source', 'year'], sort = False).sum()
#     print d
#     d_unstacked = d.unstack()
#     print d
#     indtemp1 = d.index.get_level_values(0)
#     indtemp2 = d.index.get_level_values(1)
#     indexi = zip(*[indtemp1, indtemp2])
#     print indexi
#     indexi_bis = []
#     for i in xrange(len(indexi)):
#         if indexi[i] not in indexi_bis:
#             indexi_bis.append(indexi[i])
#     indexi = indexi_bis
#     indexi = MultiIndex.from_tuples(indexi, names = ['Mesure', 'source'])
#     print indexi
#     d_unstacked = d_unstacked.reindex_axis(indexi, axis = 0)
#     print d_unstacked.to_string()
#     save_as_xls(d_unstacked)
#     return
#===============================================================================

    def reshape_tables(dfs, dfs_erf):
        # Merge the per-year openfisca (dfs) and ERF (dfs_erf) frames into a
        # single (Mesure, source) x year table ready for export.
        agg = Aggregates()

        # We need this for the columns labels to work

        print 'Resetting index to avoid later trouble on manipulation'
        # NOTE(review): reindex_axis is not in-place -- both results below are
        # discarded; confirm whether reassignment was intended.
        for d in dfs:
            d.reset_index(inplace=True)
            d.set_index('Mesure', inplace=True, drop=False)
            d.reindex_axis(labels_variables, axis=0)
            d.reset_index(inplace=True, drop=True)
#             print d.to_string()
        for d in dfs_erf:
            d.reset_index(inplace=True)
            d['Mesure'] = agg.labels['dep']
            d.set_index('index', inplace=True, drop=False)
            d.reindex_axis(agg.labels.values(), axis=0)
            d.reset_index(inplace=True, drop=True)


#             print d.to_string()

# Concatenating the openfisca tables for the different years
        temp = pd.concat([dfs[0], dfs[1]], ignore_index=True)
        temp = pd.concat([temp, dfs[2]], ignore_index=True)
        temp = pd.concat([temp, dfs[3]], ignore_index=True)
        del temp[agg.labels['entity']], temp['index']
        gc.collect()

        print 'We split the real aggregates from the of table'
        temp2 = temp[[
            agg.labels['var'], agg.labels['benef_real'],
            agg.labels['dep_real'], 'year'
        ]]
        del temp[agg.labels['benef_real']], temp[agg.labels['dep_real']]
        temp['source'] = 'of'
        temp2['source'] = 'reel'
        temp2.rename(columns={
            agg.labels['benef_real']: agg.labels['benef'],
            agg.labels['dep_real']: agg.labels['dep']
        },
                     inplace=True)
        temp = pd.concat([temp, temp2], ignore_index=True)

        print 'We add the erf data to the table'
        for df in dfs_erf:
            del df['level_0'], df['Mesure']
            df.rename(columns={
                'index': agg.labels['var'],
                1: agg.labels['dep']
            },
                      inplace=True)
        temp3 = pd.concat([dfs_erf[0], dfs_erf[1]], ignore_index=True)
        temp3 = pd.concat([temp3, dfs_erf[2]], ignore_index=True)
        temp3 = pd.concat([temp3, dfs_erf[3]], ignore_index=True)
        temp3['source'] = 'erfs'
        gc.collect()
        temp = pd.concat([temp, temp3], ignore_index=True)
        #         print temp.to_string()

        print 'Index manipulation to reshape the output'
        temp.reset_index(drop=True, inplace=True)
        # We set the new index
        #         temp.set_index('Mesure', drop = True, inplace = True)
        #         temp.set_index('source', drop = True, append = True, inplace = True)
        #         temp.set_index('year', drop = False, append = True, inplace = True)
        temp = temp.groupby(by=["Mesure", "source", "year"], sort=False).sum()
        # Tricky, the [mesure, source, year] index is unique so sum() will return the only value
        # Groupby automatically deleted the source, mesure... columns and added them to index
        assert (isinstance(temp, pd.DataFrame))
        #         print temp.to_string()

        # We want the years to be in columns, so we use unstack
        temp_unstacked = temp.unstack()
        # Unfortunately, unstack automatically sorts rows and columns, we have to reindex the table :

        ## Reindexing rows
        from pandas.core.index import MultiIndex
        indtemp1 = temp.index.get_level_values(0)
        indtemp2 = temp.index.get_level_values(1)
        indexi = zip(*[indtemp1, indtemp2])
        # Deduplicate the (Mesure, source) pairs while keeping their order.
        indexi_bis = []
        for i in xrange(0, len(indexi)):
            if indexi[i] not in indexi_bis:
                indexi_bis.append(indexi[i])
        indexi = indexi_bis
        del indexi_bis
        indexi = MultiIndex.from_tuples(indexi, names=['Mesure', 'source'])
        #         import pdb
        #         pdb.set_trace()
        temp_unstacked = temp_unstacked.reindex_axis(
            indexi, axis=0)  # axis = 0 for rows, 1 for columns

        ## Reindexing columns
        # TODO : still not working
        col_indexi = []
        for col in temp.columns.get_level_values(0).unique():
            for yr in range(2006, 2010):
                col_indexi.append((col, str(yr)))
        col_indexi = MultiIndex.from_tuples(col_indexi)
        #         print col_indexi
        #         print temp_unstacked.columns
        print col_indexi
        #         temp_unstacked = temp_unstacked.reindex_axis(col_indexi, axis = 1)

        # Our table is ready to be turned to Excel worksheet !
        print temp_unstacked.to_string()
        temp_unstacked.fillna(0, inplace=True)
        return temp_unstacked

    dfs = []
    dfs_erf = []
    for i in range(2006, 2010):
        year = i
        yr = str(i)
        # Running a standard SurveySim to get aggregates
        simulation = SurveySimulation()
        survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
        simulation.set_config(year=yr, survey_filename=survey_filename)
        simulation.set_param()
        simulation.compute()
        agg = Aggregates()
        agg.set_simulation(simulation)
        agg.compute()
        df = agg.aggr_frame
        df['year'] = year
        # Map variable names to their human-readable labels for renaming.
        label_by_name = dict(
            (name, column.label) for name, column in
            simulation.output_table.column_by_name.iteritems())
        #colonnes = simulation.output_table.table.columns
        dfs.append(df)
        variables = agg.varlist
        labels_variables = [label_by_name[variable] for variable in variables]
        del simulation, agg, df
        gc.collect()

        #Getting ERF aggregates from ERF table
        temp = build_erf_aggregates(variables=variables, year=year)
        temp.rename(columns=label_by_name, inplace=True)
        temp = temp.T
        temp.reset_index(inplace=True)
        temp['year'] = year
        dfs_erf.append(temp)
        del temp
        gc.collect()
        print 'Out of data fetching for year ' + str(year)
    print 'Out of data fetching'

    datatest = reshape_tables(dfs, dfs_erf)
    save_as_xls(datatest, alter_method=False)
def build_aggregates():
    """Compute aggregates for years 2006-2009, with rents inflated, and dump
    each year's aggregates frame to one sheet of a single Excel workbook.

    NOTE(review): `fname_all` (the workbook path), `survey` and `quifam`
    (used after writer.save()) are not defined in this function -- presumably
    module-level names; confirm they exist before running.
    """
    writer = None
    years = range(2006, 2010)
    for year in years:
        yr = str(year)
        #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        simu.set_survey()
        # Re-scale the 'loyer' (rent) variable with a year-specific inflator.
        inflator = get_loyer_inflator(year)
        simu.inflate_survey({'loyer': inflator})
        simu.compute()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()

        # The writer is created lazily on the first year, then reused so all
        # years land in the same workbook (one sheet per year).
        if writer is None:
            writer = ExcelWriter(str(fname_all))
        agg.aggr_frame.to_excel(writer,
                                yr,
                                index=False,
                                header=True,
                                float_format="%.2f")
        print agg.aggr_frame.to_string()
        # Free the per-year objects before the next (memory-heavy) iteration.
        del simu
        del agg
        import gc
        gc.collect()

    writer.save()
    # NOTE(review): `survey` and `quifam` are undefined here -- this sanity
    # check would raise NameError as written; confirm intent.
    age = survey.get_value('age')
    if sum((quifam >= 2) & (age >= 21)) != 0:
        print "they are kids that are of age >= 21"


    # Problems:
    # children over 21 and dependent parents in families with quifam=0

#    idmen = survey.get_value('idmen')
#    from numpy import max as max_
#    print max_(idmen)



if __name__ == '__main__':
    year = 2006
    simulation = SurveySimulation()
    simulation.set_config(year = year)
    simulation.set_param()
    simulation.set_survey()
    ok, message = check_inputs_enumcols(simulation)
    if not ok:
        print message
    ok, message = check_entities(simulation)
    if not ok:
        print message
    ok, message = check_weights(simulation)
    if not ok:
        print message

Beispiel #43
0
def build_aggregates():
    #    writer = None
    years = range(2009, 2010)
    for year in years:
        yr = str(year)
        #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        simu.set_survey()
        simu.compute()
        variables = ["garext", "ci_garext", "inthab", "ppe_brute", "rni"]
        x = simu.aggregated_by_entity("men", variables, all_output_vars=False)
        #        df = x[0]
        #        print df["ci_garext"].describe()
        agg = Aggregates()
        agg.set_simulation(simu)
        agg.show_default = False
        agg.show_real = False
        agg.show_diff = False
        agg.varlist = var_list  # ERROR: var_list is undefined.
        agg.compute()
        cols = agg.aggr_frame.columns[:4]
        print agg.aggr_frame[cols].to_string()
        #        if writer is None:
        #            writer = ExcelWriter(str(fname_all))
        #        agg.aggr_frame.to_excel(writer, yr, index= False, header= True)
        del simu
        del agg
        import gc
        gc.collect()
Beispiel #44
0
def test_laurence():
    '''
    Computes the openfisca/real numbers comparison table in an Excel worksheet.

    Warning: To add more years you'll have to twitch the code manually.
    Default is years 2006 to 2009 included.
    '''
    def save_as_xls(df, alter_method = True):
        # Save a dataframe to an Excel file: either through pandas'
        # ExcelWriter (alter_method=True) or through XLtable/xlwt, which
        # allows colouring and styling of the table.
        if alter_method:
            filename = "C:\desindexation.xls"
            print filename
            writer = ExcelWriter(str(filename))
            df.to_excel(writer)
            writer.save()
        else:
            # XLtable is useful for colouring/styling the table, useless otherwise
            stxl = XLtable(df)
            # <========== HERE TO CHANGE OVERLAY ======>
            wb = xlwt.Workbook()
            ws = wb.add_sheet('resultatstest')
            erfxcel = stxl.place_table(ws)
            try: # I dunno more clever commands
                wb.save("C:\outputtest.xls")
            except:
                # File probably locked (open in Excel): fall back to a
                # randomly suffixed name instead of crashing.
                n = random.randint(0,100)
                wb.save("C:\outputtest_"+str(n)+".xls")

#===============================================================================
#     from numpy.random import randn
#     mesures = ['cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add',
#                'cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add',
#                'cotsoc','af', 'add', 'cotsoc','af', 'add', 'cotsoc','af', 'add']
#     sources = ['of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
#                'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel',
#                'of', 'of', 'of', 'erfs', 'erfs', 'erfs', 'reel', 'reel', 'reel']
#     year = ['2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006', '2006',
#             '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007', '2007',
#             '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008', '2008']
#     ind = zip(*[mesures,sources, year])
# #     print ind
#     from pandas.core.index import MultiIndex
#     ind = MultiIndex.from_tuples(ind, names = ['mesure', 'source', 'year'])
# #     print ind
#     d = pd.DataFrame(randn(27,2), columns = ['Depenses', 'Recettes'], index = ind)
#     d.reset_index(inplace = True, drop = False)
#     d = d.groupby(by = ['mesure', 'source', 'year'], sort = False).sum()
#     print d
#     d_unstacked = d.unstack()
#     print d
#     indtemp1 = d.index.get_level_values(0)
#     indtemp2 = d.index.get_level_values(1)
#     indexi = zip(*[indtemp1, indtemp2])
#     print indexi
#     indexi_bis = []
#     for i in xrange(len(indexi)):
#         if indexi[i] not in indexi_bis:
#             indexi_bis.append(indexi[i])
#     indexi = indexi_bis
#     indexi = MultiIndex.from_tuples(indexi, names = ['Mesure', 'source'])
#     print indexi
#     d_unstacked = d_unstacked.reindex_axis(indexi, axis = 0)
#     print d_unstacked.to_string()
#     save_as_xls(d_unstacked)
#     return
#===============================================================================

    def reshape_tables(dfs, dfs_erf):
        # Merge the per-year OpenFisca frames (dfs) and ERF frames (dfs_erf)
        # into a single frame indexed by (Mesure, source) with years as columns.
        # NOTE(review): relies on `labels_variables` from the enclosing loop
        # (so only the last year's labels are used) -- confirm this is intended.
        agg = Aggregates()

        # We need this for the columns labels to work

        print 'Resetting index to avoid later trouble on manipulation'
        for d in dfs:
            d.reset_index(inplace = True)
            d.set_index('Mesure', inplace = True, drop = False)
            # NOTE(review): reindex_axis is NOT in-place; this return value is
            # discarded, so the reordering has no effect -- confirm intent.
            d.reindex_axis(labels_variables, axis = 0)
            d.reset_index(inplace = True, drop = True)
#             print d.to_string()
        for d in dfs_erf:
            d.reset_index(inplace = True)
            d['Mesure'] = agg.labels['dep']
            d.set_index('index', inplace = True, drop = False)
            # NOTE(review): same discarded reindex_axis result as above.
            d.reindex_axis(agg.labels.values(), axis = 0)
            d.reset_index(inplace = True, drop = True)
#             print d.to_string()

        # Concatenating the OpenFisca tables for the different years
        temp = pd.concat([dfs[0],dfs[1]], ignore_index = True)
        temp = pd.concat([temp,dfs[2]], ignore_index = True)
        temp = pd.concat([temp,dfs[3]], ignore_index = True)
        del temp[agg.labels['entity']], temp['index']
        gc.collect()

        print 'We split the real aggregates from the of table'
        temp2 = temp[[agg.labels['var'], agg.labels['benef_real'], agg.labels['dep_real'], 'year']]
        del temp[agg.labels['benef_real']], temp[agg.labels['dep_real']]
        temp['source'] = 'of'
        temp2['source'] = 'reel'
        # Align the "real" columns' names with the simulated ones so both
        # frames can be stacked into a single long table.
        temp2.rename(columns = {agg.labels['benef_real'] : agg.labels['benef'],
                                agg.labels['dep_real'] : agg.labels['dep']},
                     inplace = True)
        temp = pd.concat([temp,temp2], ignore_index = True)

        print 'We add the erf data to the table'
        for df in dfs_erf:
            del df['level_0'], df['Mesure']
            df.rename(columns = {'index' : agg.labels['var'], 1 : agg.labels['dep']}, inplace = True)
        temp3 = pd.concat([dfs_erf[0], dfs_erf[1]], ignore_index = True)
        temp3 = pd.concat([temp3, dfs_erf[2]], ignore_index = True)
        temp3 = pd.concat([temp3, dfs_erf[3]], ignore_index = True)
        temp3['source'] = 'erfs'
        gc.collect()
        temp = pd.concat([temp, temp3], ignore_index = True)
#         print temp.to_string()

        print 'Index manipulation to reshape the output'
        temp.reset_index(drop = True, inplace = True)
        # We set the new index
#         temp.set_index('Mesure', drop = True, inplace = True)
#         temp.set_index('source', drop = True, append = True, inplace = True)
#         temp.set_index('year', drop = False, append = True, inplace = True)
        temp = temp.groupby(by=["Mesure", "source", "year"], sort = False).sum()
        # Tricky, the [mesure, source, year] index is unique so sum() will return the only value
        # Groupby automatically deleted the source, mesure... columns and added them to index
        assert(isinstance(temp, pd.DataFrame))
#         print temp.to_string()

        # We want the years to be in columns, so we use unstack
        temp_unstacked = temp.unstack()
        # Unfortunately, unstack automatically sorts rows and columns, we have to reindex the table :

        ## Reindexing rows
        from pandas.core.index import MultiIndex
        indtemp1 = temp.index.get_level_values(0)
        indtemp2 = temp.index.get_level_values(1)
        indexi = zip(*[indtemp1, indtemp2])
        # Deduplicate (Mesure, source) pairs while preserving first-seen order.
        indexi_bis = []
        for i in xrange(0,len(indexi)):
            if indexi[i] not in indexi_bis:
                indexi_bis.append(indexi[i])
        indexi = indexi_bis
        del indexi_bis
        indexi = MultiIndex.from_tuples(indexi, names = ['Mesure', 'source'])
#         import pdb
#         pdb.set_trace()
        temp_unstacked = temp_unstacked.reindex_axis(indexi, axis = 0) # axis = 0 for rows, 1 for columns

        ## Reindexing columns
        # TODO : still not working
        col_indexi = []
        for col in temp.columns.get_level_values(0).unique():
            for yr in range(2006,2010):
                col_indexi.append((col, str(yr)))
        col_indexi = MultiIndex.from_tuples(col_indexi)
#         print col_indexi
#         print temp_unstacked.columns
        print col_indexi
#         temp_unstacked = temp_unstacked.reindex_axis(col_indexi, axis = 1)

        # Our table is ready to be turned to Excel worksheet !
        print temp_unstacked.to_string()
        temp_unstacked.fillna(0, inplace = True)
        return temp_unstacked



    dfs = []
    dfs_erf = []
    for i in range(2006,2010):
        year = i
        yr = str(i)
        # Running a standard SurveySim to get aggregates
        simulation = SurveySimulation()
        survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
        simulation.set_config(year=yr, survey_filename=survey_filename)
        simulation.set_param()
        simulation.compute()
        agg = Aggregates()
        agg.set_simulation(simulation)
        agg.compute()
        df = agg.aggr_frame
        df['year'] = year
        # Map variable short names to their human-readable labels.
        label_by_name = dict(
            (name, column.label)
            for name, column in simulation.output_table.column_by_name.iteritems()
            )
        #colonnes = simulation.output_table.table.columns
        dfs.append(df)
        variables = agg.varlist
        labels_variables = [
            label_by_name[variable]
            for variable in variables
            ]
        # Free the heavy per-year objects before fetching ERF data.
        del simulation, agg, df
        gc.collect()

        # Getting ERF aggregates from the ERF table; transpose so variables
        # become rows, matching the OpenFisca frames' orientation.
        temp = build_erf_aggregates(variables=variables, year= year)
        temp.rename(columns = label_by_name, inplace = True)
        temp = temp.T
        temp.reset_index(inplace = True)
        temp['year'] = year
        dfs_erf.append(temp)
        del temp
        gc.collect()
        print 'Out of data fetching for year ' + str(year)
    print 'Out of data fetching'

    datatest = reshape_tables(dfs, dfs_erf)
    save_as_xls(datatest, alter_method = False)
# Licensed under the terms of the GVPLv3 or later license
# (see openfisca/__init__.py for details)

# Author: Victor Le Breton

from openfisca_core.simulations import SurveySimulation, Simulation, ScenarioSimulation
import os
from openfisca_core import model

from .common import load_content


if __name__ == "__main__":

    # Code pour tester save_content
    yr = 2006
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=yr, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()
    print simulation.__dict__.keys()

    print simulation.output_table.__dict__.keys()
    print 'done'
    simulation.save_content('testundeux', 'fichiertestundeux')

    a = load_content('testundeux', 'fichiertestundeux')
    print a.output_table.column_by_name
    print a.input_table.table['idfoy'][0:50]
Beispiel #46
0
            bigtemp['table'] = bigtemp['table'].groupby(['idmen','quimen'], sort = False).sum()
            print "Table of values for %s dependencies : \n" %col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()

if __name__ == '__main__':

    # Build (or reuse) a saved 2006 simulation, then debug the 'af' variable.
    restart = True
    survey = 'survey.h5'
    store_dir = os.path.join(model.DATA_DIR, 'erf')
    saved_simulation_filename = os.path.join(store_dir, 'debugger_' + survey[:-3])

    if restart:
        sim = SurveySimulation()
        # 'survey.h5' lives directly under DATA_DIR; any other survey file is
        # looked up under the sources/ subdirectory.
        if survey == 'survey.h5':
            source_path = os.path.join(model.DATA_DIR, survey)
        else:
            source_path = os.path.join(model.DATA_DIR, 'sources', survey)

        sim.set_config(year=2006, survey_filename=source_path)
        sim.set_param()
        sim.compute()
        sim.save_content('debug', saved_simulation_filename)

    debugger = Debugger()
    debugger.set_simulation(name = 'debug', filename = saved_simulation_filename)
    debugger.set_variable('af')
    debugger.show_aggregates()
    debugger.preproc()
def compar_num_table():
    """Benchmark the 3-table vs 1-table survey storage layouts.

    For each year, the same simulation is run twice (num_table=3 then
    num_table=1); aggregates are exported to Excel and the entity tables to
    csv (consumed by an R comparison script), and the timings are printed.

    NOTE(review): `destination_dir` and `num_output` are not defined here --
    presumably module-level names; confirm. The HDFStore opened below is
    never written to nor closed. `time.clock`, `DataFrame.sort`, `.ix` and
    `Index - list` are legacy APIs (old Python/pandas).
    """
    writer = None
    years = range(2006, 2007)
    tot1 = 0
    tot3 = 0
    filename = destination_dir + 'output3.h5'
    store = HDFStore(filename)
    for year in years:
        yr = str(year)
        #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        import time

        deb3 = time.clock()
        # Candidate household subsets kept for debugging; each assignment
        # overwrites the previous one, so only the final None (full sample)
        # is effective.
        sous_ech = [
            6000080, 6000080, 6000195, 6000195, 6000288, 6000288, 6000499,
            6000499, 6000531, 6000531, 6000542, 6000542
        ]
        sous_ech = [
            6000191, 6000191, 6000531, 6000614, 6000195, 6000195, 6000499,
            6000499, 6000531, 6000614, 6000531, 6000614, 6000531, 6000531,
            6000195, 6000195, 6000288, 6000288, 6000499, 6000499, 6000531,
            6000542, 6000542, 6000614, 6000191
        ]

        #al
        sous_ech = [6000122, 6000865, 6001256]
        # typ_men
        sous_ech = [6006630, 6006753, 6008508]
        # foy
        sous_ech = [6036028, 6028397, 6019248]

        sous_ech = None
        simu.set_survey(num_table=3, subset=sous_ech)
        simu.compute()

        agg3 = Aggregates()
        # NOTE(review): `tab` is rebound by rename (not in-place) and then
        # discarded each iteration -- this loop has no lasting effect; confirm.
        for ent in ['ind', 'men', 'foy', 'fam']:
            tab = simu.output_table.table3[ent]
            renam = {}
            renam['wprm_' + ent] = 'wprm'
            tab = tab.rename(columns=renam)
        agg3.set_simulation(simu)
        agg3.compute()

        fin3 = time.clock()

        #        if writer is None:
        #            writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg3.xlsx')
        agg3.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        # export to csv to run compar in R
        for ent in ['ind', 'men', 'foy', 'fam']:
            dir_name = destination_dir + ent + '.csv'
            tab = simu.output_table.table3[ent]
            renam = {}
            renam['wprm_' + ent] = 'wprm'
            # Individuals carry the role columns (qui*); other entities only ids.
            if ent == 'ind':
                ident = [
                    "idmen", "quimen", "idfam", "quifam", "idfoy", "quifoy"
                ]
            else:
                ident = ["idmen", "idfam", "idfoy"]
            for nom in ident:
                renam[nom + '_' + ent] = nom
            tab = tab.rename(columns=renam)
            # Put the id columns first, then everything else (old pandas
            # Index - list = set difference).
            order_var = ident + list(tab.columns - ident)
            tab.sort(['idmen', 'idfam',
                      'idfoy']).ix[:num_output, order_var].to_csv(dir_name)

        deb1 = time.clock()
        simu.set_survey(num_table=1, subset=sous_ech)
        simu.compute()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()

        fin1 = time.clock()

        # export to csv to run compar in R
        dir_name = destination_dir + 'en1' + '.csv'
        tab = simu.output_table.table
        # Drop the duplicated cross-entity id/weight columns of the flat table.
        tab.drop([
            'idfam_fam', 'idfam_foy', 'idfam_men', 'idfoy_fam', 'idfoy_foy',
            'idfoy_men', 'idmen_men', 'idmen_fam', 'idmen_foy', 'wprm_foy',
            'wprm_fam'
        ],
                 axis=1,
                 inplace=True)
        renam = {}
        ent = 'ind'
        renam['wprm_' + ent] = 'wprm'
        ident = [
            "noi", "idmen", "quimen", "idfam", "quifam", "idfoy", "quifoy"
        ]
        for nom in ident:
            renam[nom + '_' + ent] = nom
        tab = tab.rename(columns=renam)
        order_var = ident + list(tab.columns - ident)
        tab.sort(['idmen', 'idfam', 'idfoy']).ix[:num_output,
                                                 order_var].to_csv(dir_name)

        #        if writer is None:
        #            writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg1.xlsx')
        agg.aggr_frame.to_excel(fname_all, yr, index=False, header=True)
        del simu
        del agg
        import gc
        gc.collect()
        tot1 += fin1 - deb1
        tot3 += fin3 - deb3
        print "Time to process 1 table :" + str(fin1 - deb1)
        print "Time to process 3 table :" + str(fin3 - deb3)
    print tot1, tot3, tot3 - tot1
def check_survey(year = 2013):
    """Smoke test: configure and run a full SurveySimulation for *year*."""
    simu = SurveySimulation()
    simu.set_config(year=year)
    simu.set_param()
    simu.compute()
def build_aggregates3():
    """Benchmark aggregate computation under the 3-table vs 1-table survey
    layouts, exporting aggregates to Excel and entity tables to csv.

    NOTE(review): `destination_dir`, `num_output` and `Aggregates3` are not
    defined in this function -- presumably module-level names; confirm.
    `time.clock` is a legacy (pre-3.8) API.
    """
    writer = None
    years = range(2006, 2007)
    tot1 = 0
    tot3 = 0
    for year in years:
        yr = str(year)
        #        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        import time

        # --- 3-table layout: time the survey load + compute. ---
        deb3 = time.clock()
        simu.set_survey(num_table=3)
        simu.compute()
        fin3 = time.clock()

        print "coucou"
        col = simu.survey.column_by_name.get("so")
        print col.entity
        agg3 = Aggregates3()
        agg3.set_simulation(simu)
        agg3.compute()
        #        if writer is None:
        #            writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg3.xlsx')
        agg3.aggr_frame.to_excel(fname_all, yr, index=False, header=True)

        # Dump each entity's table (first num_output rows) to csv.
        for ent in ['ind', 'men', 'foy', 'fam']:
            dir_name = destination_dir + ent + '.csv'
            ##            simu.survey.table3[ent].to_csv(dir_name)
            #            import pdb
            #            pdb.set_trace()
            ##            com.convert_to_r_dataframe
            simu.output_table.table3[ent][:num_output].to_csv(dir_name)

        # --- 1-table layout: time the survey load + compute. ---
        deb1 = time.clock()
        simu.set_survey(num_table=1)
        print "prob compute"
        simu.compute()
        fin1 = time.clock()

        dir_name = destination_dir + 'en1' + '.csv'
        print "prob output"
        simu.output_table.table[:num_output].to_csv(dir_name)

        agg = Aggregates()
        print "prob set"
        agg.set_simulation(simu)
        print "prob compute"
        agg.compute()

        #        if writer is None:
        #            writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg1.xlsx')
        print "prob ind"
        agg.aggr_frame.to_excel(fname_all, yr, index=False, header=True)
        del simu
        del agg
        import gc
        gc.collect()
        tot1 += fin1 - deb1
        tot3 += fin3 - deb3
        print "Time to process 1 table :" + str(fin1 - deb1)
        print "Time to process 3 table :" + str(fin3 - deb3)
    print tot1, tot3, tot3 - tot1
Beispiel #50
0
def compar_num_table():

    writer = None
    years = range(2006,2007)
    tot1 = 0
    tot3 = 0
    filename = destination_dir+'output3.h5'
    store = HDFStore(filename)
    for year in years:
        yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year = yr)
        simu.set_param()
        import time


        deb3 = time.clock()
        sous_ech =  [6000080, 6000080, 6000195, 6000195, 6000288, 6000288, 6000499, 6000499, 6000531, 6000531, 6000542, 6000542]
        sous_ech =  [6000191, 6000191, 6000531, 6000614, 6000195, 6000195, 6000499, 6000499, 6000531, 6000614, 6000531,
        6000614, 6000531, 6000531, 6000195, 6000195, 6000288, 6000288, 6000499, 6000499, 6000531, 6000542,
         6000542, 6000614, 6000191]

        #al
        sous_ech =  [6000122, 6000865, 6001256]
        # typ_men
        sous_ech =  [6006630, 6006753, 6008508]
        # foy
        sous_ech =  [6036028, 6028397, 6019248]

        sous_ech = None
        simu.set_survey(num_table=3, subset=sous_ech)
        simu.compute()

        agg3 = Aggregates()
        for ent in ['ind','men','foy','fam']:
            tab = simu.output_table.table3[ent]
            renam={}
            renam['wprm_'+ent] = 'wprm'
            tab = tab.rename(columns=renam)
        agg3.set_simulation(simu)
        agg3.compute()

        fin3  = time.clock()


#        if writer is None:
#            writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg3.xlsx')
        agg3.aggr_frame.to_excel(fname_all, yr, index= False, header= True)



        # export to csv to run compar in R
        for ent in ['ind','men','foy','fam']:
            dir_name = destination_dir + ent +'.csv'
            tab = simu.output_table.table3[ent]
            renam ={}
            renam['wprm_'+ent] = 'wprm'
            if ent=='ind':
                ident = ["idmen","quimen","idfam","quifam","idfoy","quifoy"]
            else:
                ident = ["idmen","idfam","idfoy"]
            for nom in ident:
                renam[nom+'_'+ent] = nom
            tab = tab.rename(columns=renam)
            order_var = ident+list(tab.columns - ident)
            tab.sort(['idmen','idfam','idfoy']).ix[:num_output,order_var].to_csv(dir_name)


        deb1 = time.clock()
        simu.set_survey(num_table=1, subset=sous_ech)
        simu.compute()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()

        fin1  = time.clock()

        # export to csv to run compar in R
        dir_name = destination_dir + 'en1' +'.csv'
        tab = simu.output_table.table
        tab.drop(['idfam_fam','idfam_foy','idfam_men','idfoy_fam','idfoy_foy','idfoy_men','idmen_men','idmen_fam','idmen_foy','wprm_foy','wprm_fam'],
                        axis=1, inplace=True)
        renam ={}
        ent = 'ind'
        renam['wprm_'+ent] = 'wprm'
        ident = ["noi","idmen","quimen","idfam","quifam","idfoy","quifoy"]
        for nom in ident:
            renam[nom+'_'+ent] = nom
        tab = tab.rename(columns=renam)
        order_var = ident+list(tab.columns - ident)
        tab.sort(['idmen','idfam','idfoy']).ix[:num_output,order_var].to_csv(dir_name)


#        if writer is None:
#            writer = ExcelWriter(str(fname_all))
        fname_all = os.path.join(destination_dir, 'agg1.xlsx')
        agg.aggr_frame.to_excel(fname_all, yr, index= False, header= True)
        del simu
        del agg
        import gc
        gc.collect()
        tot1 += fin1 - deb1
        tot3 += fin3 - deb3
        print "Time to process 1 table :" +str(fin1 - deb1)
        print "Time to process 3 table :" +str(fin3 - deb3)
    print tot1, tot3, tot3- tot1
def build_aggregates():

    writer = None
    years = range(2006,2010)
    for year in years:
        yr = str(year)
#        fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year = yr)
        simu.set_param()
        simu.set_survey()
        inflator = get_loyer_inflator(year)
        simu.inflate_survey({'loyer' : inflator})
        simu.compute()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()

        if writer is None:
            writer = ExcelWriter(str(fname_all))
        agg.aggr_frame.to_excel(writer, yr, index= False, header= True, float_format="%.2f")
        print agg.aggr_frame.to_string()
        del simu
        del agg
        import gc
        gc.collect()


    writer.save()
Beispiel #52
0
def get_common_dataframe(variables, year=2006):
    """
    Compare variables in ERF and openfisca.

    Builds one household-level dataframe merging, on 'idmen', the OpenFisca
    aggregates of *variables* with the corresponding ERF survey values
    (ERF columns are suffixed with "_erf").

    Parameters:
        variables : list of variable names to compare
        year : simulation year (default 2006)

    Returns a pandas DataFrame with one row per household.
    """
    simulation = SurveySimulation()
    simulation.set_config(year=year)
    simulation.set_param()
    simulation.set_survey()
    simulation.compute()

    # Fix: the ERF table was hard-coded to 2006 whatever *year* was requested.
    erf = ErfsDataTable(year=year)
    # The household id and the weights are always needed; append them without
    # mutating the caller's list.
    erf_variables = list(variables)
    if "ident" not in erf_variables:
        erf_variables.append("ident")
    if "wprm" not in erf_variables:
        erf_variables.append("wprm")

    erf_dataframe = erf.get_values(erf_variables, table="menage")
    erf_dataframe.rename(columns={'ident': 'idmen'}, inplace=True)
    # Fix: the original used `col is not "idmen"` (identity, not equality),
    # which only worked by accident of string interning.
    erf_dataframe.rename(
        columns=dict((col, col + "_erf")
                     for col in erf_dataframe.columns if col != "idmen"),
        inplace=True)

    of_dataframe, of_dataframe_default = simulation.aggregated_by_entity(
        "men", variables, all_output_vars=False, force_sum=True)
    del of_dataframe_default

    merged_df = of_dataframe.merge(erf_dataframe, on="idmen")
    del of_dataframe, erf_dataframe
    return merged_df
Beispiel #53
0
def test(year=2006, variables = ['af']):
    """Debugging helper: compare openfisca aggregates for `variables`
    against the ERF/EEC survey data and drill down into the households
    with the largest relative differences, following each variable's
    dependency graph (parents/children) to locate the source of the gap.
    """
    # NOTE(review): mutable default argument (variables=['af']) -- shared
    # across calls; safe only as long as it is never mutated.
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

#     of_aggregates = Aggregates()
#     of_aggregates.set_simulation(simulation)
#     of_aggregates.compute()
#     print of_aggregates.aggr_frame
#
#     from openfisca_france.data.erf.aggregates import build_erf_aggregates
#     temp = (build_erf_aggregates(variables=variables, year= year))
#     print temp
#     return
    variable= "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()




    # Recursively collect a variable column together with all of its
    # ancestor columns (transitive closure over `_parents`).
    def get_all_ancestors(varlist):
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]]
                      + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]]
                 + get_all_ancestors(list(varlist[0]._parents))
                  + get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(lambda x: simulation.output_table.column_by_name.get(x), variables)
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in variables:
        children = set()
        varcol = simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    # Aggregate a single variable at the household ("men") level.
    def get_var(variable):
        variables =[variable]
        return simulation.aggregated_by_entity(entity="men", variables=variables,
                                                all_output_vars = False, force_sum=True)[0]

    # Build one wide table with every variable of interest, merged on idmen.
    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]], on = 'idmen', how = 'outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo

    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom
    et parfois sont les variables utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables principales ( au moins les aggrégats
    que l'on compare )
    Si les variables ne sont pas directement dans la table,
    elles ont été calculées à partir d'autres variables de données erf/eec
    donc chercher dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''

    # Fetch the raw ERF and EEC individual tables and join them on noindiv.
    fetch_eec = ['statut','titc','chpub','encadr','prosa','age','naim','naia','noindiv']
    fetch_erf = ['zsali','af','ident','wprm','noi','noindiv','quelfic']
    erf_df = erf_data.get_of_values(variables= fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables= fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on ='noindiv', how = 'inner' )
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables= list(todo) + ['quelfic'], table="erf_menage")

    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    # Rename ERF columns to their openfisca equivalents.
    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns = erf2of, inplace = True)

# We get the options from the simulation non aggregated tables:

    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table['noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table['noindiv'].astype(np.int64)
    s1 = [var for var in set(options).intersection(set(simulation.output_table.table.columns))] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns = {'idmen_ind' : 'idmen', 'quimen_ind':'quimen'}, inplace = True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1)- set(['idmen_ind', 'quimen_ind','noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [var for var in (set(options).intersection(set(simulation.input_table.table.columns)) - set(s1))] + ['noindiv']
        #print s2
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on = 'noindiv', how = 'inner', sort = False)

        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen','noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns = {'ident' : 'idmen'}, inplace = True)

    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace = True)
    assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" %str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" %str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" %str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()

    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(set(simu_aggr_tables.columns))) - set(['idmen','wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace = True)
    simu_aggr_tables.reset_index(inplace = True)
    # NOTE(review): this element-wise comparison relies on both frames being
    # row-aligned after the reset_index above -- confirm ordering is identical.
    for col in colcom:
        temp = set(erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" %(col, str(len(temp)))
        del temp


    # Detect the biggest differences
    bigtable = merge(erf_menage, simu_aggr_tables, on = 'idmen', how = 'inner', suffixes=('_erf','_of'))
    print 'Length of new dataframe is %s' %str(len(bigtable))
    #print bigtable.columns
    bigtable.set_index('idmen', drop = False, inplace = True)

    already_met = []
    options_met = []

    for col in colcom:
        bigtemp = None
        # NOTE(review): `and_` is not defined in this chunk -- presumably
        # numpy.logical_and (or operator.and_) imported elsewhere; verify.
        table = bigtable[and_(bigtable[col+'_erf']!=0,bigtable[col+'_of']!=0)]
        table[col] = (table[col+'_erf'] - table[col+'_of']) / table[col+'_erf'] #Difference relative
        table[col] = table[col].apply(lambda x: abs(x))
        print 'Minimum difference between the two tables for %s is %s' %(col, str(table[col].min()))
        print 'Maximum difference between the two tables for %s is %s' %(col, str(table[col].max()))
        print table[col].describe()
        try:
            # mwp: weighted-percentile helper defined elsewhere in the file.
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col], np.arange(1,11), table['wprm_of'], 2, return_quantiles=True)
            #print sorted(values)
            dec, values = mwp(table[col], np.arange(1,101), table['wprm_erf'], 2, return_quantiles=True)
            #print sorted(values)[90:]
            del dec, values
            gc.collect()
        except:
            #print 'Weighted percentile method didnt work for %s' %col
            pass
        print "\n"

    # Show the relevant information for the most deviant households
        table.sort(columns = col, ascending = False, inplace = True)
        #print table[col][0:10].to_string()
        if bigtemp is None:
            bigtemp = {'table' : table[[col, col+'_of', col+'_erf', 'idmen']][0:10],
                       'options' : None}
        bigtemp['table'][col+'div'] = bigtemp['table'][col+'_of'] / bigtemp['table'][col+'_erf']
        print bigtemp['table'].to_string()

        '''
        bigtemp is the table which will get filled little by little by the relevant variables.
        Up to the last rows of code 'table' refers to a table of aggregated values,
        while 'options is a table of individual variables.
        The reason we call it in a dictionnary is also because we modify it inside the recursive function 'iter_on parents',
        and it causes an error in Python unless for certain types like dictionnary values.
        '''
        #print "\n"

        # If variable is a Prestation, we show the dependancies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):

            '''
            For the direct children
            '''
            if not varcol._children is None:
                ch_to_fetch = list(varcol._children)
                ch_to_fetch = map(lambda x: x.name, ch_to_fetch)
                ch_fetched = []

                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print "Variables which need %s to be computed :\n %s \n" %(col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                    print "Variables which need %s to be computed (some missing picked in erf):\n %s \n" %(col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print "Variables which need %s to be computed (some missing):\n %s \n" %(col, str(ch_to_fetch))
                    for var in ch_to_fetch:

                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')

                print table[[col] + ch_fetched][0:10]
                print "\n"
                del ch_to_fetch, ch_fetched

            '''
            For the parents
            '''
            # Walk up the dependency graph, accumulating parent values and
            # option variables into bigtemp; `already_met`/`options_met`
            # prevent revisiting the same column twice.
            def iter_on_parents(varcol):
                if (varcol._parents == set() and varcol._option == {}) or varcol.name in already_met:
                    return
                else:
                    par_to_fetch = list(varcol._parents)
                    par_to_fetch = map(lambda x: x.name, par_to_fetch)
                    par_fetched = []

                    # NOTE(review): these branches test and iterate
                    # `par_fetched`, which is empty at this point --
                    # `par_to_fetch` was probably intended; confirm.
                    if set(par_fetched) <= set(simu_aggr_tables.columns):
                        #print "Variables the prestation %s depends of :\n %s \n" %(varcol.name, str(par_fetched))
                        for var in par_fetched:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                            else:
                                par_fetched.append(var)
                    elif set(par_fetched) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                        #print "Variables the prestation %s depends of (some missing picked in erf):\n %s \n" %(varcol.name,str(par_fetched))
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var + '_erf' in table.columns:
                                par_fetched.append(var + '_erf')
                            else:
                                par_fetched.append(var)
                    else:
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var in erf_menage.columns:
                                if var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                        if len(par_fetched) > 0:
                            #print "Variables the prestation %s depends of (some missing):\n %s \n" %(varcol.name, str(par_fetched))
                            pass
                        else:
                            #print "Variables the prestation %s depends of couldn't be found :\n %s \n" %(varcol.name, str(par_fetched))
                            pass

                    if len(par_fetched) > 0:
                        temp = table[[col, 'idmen'] + par_fetched][0:10]
                        bigtemp['table'] = pd.merge(temp, bigtemp['table'], how = 'inner')
                        #print temp.to_string(), "\n"
                    if varcol._option != {} and not set(varcol._option.keys()) < set(options_met):
                        vars_to_fetch = list(set(varcol._option.keys())-set(options_met))
                        #print "and the options to current variable %s for the id's with strongest difference :\n %s \n" %(varcol.name, varcol._option.keys())
                        liste = [i for i in range(0,10)]
                        liste = map(lambda x: table['idmen'].iloc[x], liste)
                        temp = simu_nonaggr_tables[['idmen', 'quimen','noindiv']
                                                  + vars_to_fetch][simu_nonaggr_tables['idmen'].isin(table['idmen'][0:10])]

                        temp_sorted = temp[temp['idmen'] == liste[0]]
                        for i in xrange(1,10):
                            temp_sorted = temp_sorted.append(temp[temp['idmen'] == liste[i]])
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(erf_eec_indivi, on = 'noindiv', how = 'outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(temp_sorted, on = ['noindiv','idmen','quimen'], how = 'outer')
#                         temp_sorted.set_index(['idmen',  'quimen'], drop = True, inplace = True) # If we do that
                        del temp, temp_sorted
                        gc.collect()

                    already_met.append(varcol.name)
                    options_met.extend(varcol._option.keys())
                    for var in varcol._parents:
                        iter_on_parents(var)

            iter_on_parents(varcol)
            # We merge the aggregate table with the option table ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                       how = 'left',
                                                        on = 'idmen',
                                                         suffixes = ('(agg)', '(ind)'))

            # Reshaping the table to group by descending error on col, common entities
            bigtemp['table'].sort(columns = ['af','quimen'], ascending = [False,True], inplace = True)
            bigtemp['table'] = bigtemp['table'].groupby(['idmen','quimen'], sort = False).sum()
            print "Table of values for %s dependencies : \n" %col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()
    def run_OF(self):
        '''
        Run the OF (openfisca) computation on the test cases produced by TaxIPP.
        Input: the .dta file produced by the previous step.
        Side effects: sets self.simulation and self.openfisca_outputput;
        returns the adapted input survey dataframe.
        '''
        dta_input = self.paths['dta_input']
        dic = self.dic_scenar
        
        def _test_of_dta(dta_input, dic):
            ''' Check that the .dta table found on disk matches the
            expected scenario; drop into pdb otherwise. '''
            data = read_stata(dta_input)
            dic_dta =  data.loc[0,'dic_scenar']
            if str(dic) != str(dic_dta) :
                print "La base .dta permettant de lancer la simulation OF est absente "
                print "La base s'en rapprochant le plus a été construite avec les paramètres : ", dic_dta
                
                pdb.set_trace()
            else :
                data = data.drop('dic_scenar', 1)
            return data
        
        def _adaptation_var(data, dic_var):
            # Rename and derive the columns TaxIPP provides into the
            # variables openfisca expects (quifoy/quimen/so/chpub...).

            def _quifoy(col):
                # TODO: handle dependent children (pac)
                # NOTE(review): `col["conj"]==1 + col["pac"]==1` parses as the
                # chained comparison col["conj"] == (1 + col["pac"]) == 1,
                # which is almost certainly not the intent -- verify.
                try:
                    quifoy = col["conj"]==1 + col["pac"]==1 # TODO: faux 
                except:
                    quifoy = col["conj"]==1
                return quifoy
            
            def _quimen(col):
                # TODO: handle children
                # NOTE(review): same chained-comparison precedence concern
                # as in _quifoy above.
                try:
                    quimen = col["conj"]==1 + col["concu"]==1 + col["pac"]==1
                except:
                    quimen = col["conj"]==1 + col["concu"]==1 
                return quimen
            
            def _so(data):
                # Occupancy status: 1 owner with loan, 2 outright owner,
                # 4 tenant, 6 housed for free (codes used by openfisca's 'so').
                data["so"] = 0
                data.loc[data['proprio_empr'] == 1, 'so'] = 1
                data.loc[data['proprio'] == 1, 'so'] = 2
                data.loc[data['locat'] == 1, 'so'] = 4
                data.loc[data['loge'] == 1, 'so'] = 6
                return data
            
            def _compl(var):
                # Boolean complement (1 -> 0, 0 -> 1).
                var = 1- var
                return var
            
            def _count_by_ff(var):
                ''' Count the number of booleans == 1 within the fiscal household.
                NOTE(review): this helper is unused and broken as written --
                the while condition never changes, `j += j` and
                `compteur += compteur` keep their values at 0. Do not call
                without fixing.'''
                nmen = 10
                for i in range(1, nmen):
                    compteur=0
                    j=0
                    while data['idfoy'] == i:
                        j += j
                        if var ==1:
                            compteur += compteur
                        else :  
                            compteur = compteur
                        data[j,'var'] = compteur
                
            def _enf(data):
                # NOTE(review): unused (call site commented out below).
                # `11<data['age']<15` is a chained comparison on a Series and
                # raises ValueError if ever executed -- needs rewriting with
                # element-wise & before re-enabling.
                data["enf_college"] = 0
                if  (11<data['age']<15):
                    data['enf_college'] = 1
                else:
                    data['enf_college'] = 0
#                data["enf_lycee"] = (data['age'] > 14 & data['age']<19)
#                data["enf_sup"] = (data['age'] >18)
#                data["f7ea"] = _count_by_ff(data["enf_college"]) 
#                data["nenf1113"] + data["nenf1415"] #11-14
#                data["f7ec"] = _count_by_ff(data["enf_lycee"]) #data["nenf1617"] #15-17
#                data["f7ef"] = _count_by_ff(data["enf_sup"]) #data["nenfmaj1819"] + data["nenfmaj20"] + data["nenfmaj21plus"] #>17
                data.drop(["nenf1113", "nenf1415", "nenf1617", "nenfmaj1819", "nenfmaj20",
                                   "nenfmaj21plus", "nenfnaiss", "nenf02",  "nenf35",  "nenf610"], axis = 1, inplace=True)
                return data
            
            def _workstate(data):
                # chpub: 1 = public sector, 6 = private sector (openfisca codes).
                data['chpub'] = 0
                data.loc[data['public'] == 1, 'chpub'] = 1
                data.loc[data['public'] == 0, 'chpub' ] = 6
                return data
            
            data.rename(columns= dic_var, inplace=True)
            data["agem"] = 12*data["age"]  # age in months
            
            data["idfam"] = data["idmen"]
            data["quifoy"] = data.apply(_quifoy, axis=1).astype(int)
            data["quimen"] = data.apply(_quimen, axis=1).astype(int)
            data["quifam"] = data['quimen']
            
            data = _so(data)
            #data = _enf(data)
            data = _workstate(data)
            print data.columns
            #data["caseN"] = _comp(data["caseN"])
            doubt = ["rfin"]
            
            # Variables with no openfisca equivalent (mostly self-employment
            # and housing micro-detail) plus intermediate TaxIPP columns.
            not_in_OF = [ "p1", "nbh", "nbh_sal", "loge_proprio",  "loge_locat",  "loge_autr", "loyer_fictif",  "loyer_verse",  "loyer_marche", "pens_alim_ver_foy", "sal_brut",  "sal_h_brut",
                         "bail_prive",  "bail_pers_phys",  "loyer_conso",  "proprio_men",  "locat_men", "loge_men",  "proprio_empr_men", "loyer_fictif_men", 
                         "bail_prive_men",  "bail_pers_phys_men", "loyer_marche_men", "loyer_conso_men",
                          "ba_irpp",  "bic_irpp",  "bnc_irpp",  "nonsalexo_irpp", "nonsal_brut_cn", "nonsal_brut_cn_foy", "nonsal_brut", "nonsal_h_brut"] # variables non-salariés
            other_vars_to_drop = ["couple", "decl", "conj", "pac", "proprio_empr", "proprio", "locat", "nonsal_irpp", "nadul", 
                          "loge", "marie", "change", "pondv", "concu", "cohab", "nenf_concu", "num_indf", "npers", "age_conj", "n_foy_men", "public"]
            vars_to_drop = [var for var in (other_vars_to_drop + not_in_OF) if var in data.columns]            
            data = data.drop(vars_to_drop, axis=1)
            data.rename(columns={"id_conj" : "conj"}, inplace = True)
            # print data.columns
            return data
        
        data_IPP = _test_of_dta(dta_input, dic)
        openfisca_survey =  _adaptation_var(data_IPP, self.dic_var_input)
        openfisca_survey = openfisca_survey.fillna(0)

        simulation = SurveySimulation()
        simulation.set_config(year=self.datesim, survey_filename = openfisca_survey)
        simulation.set_param()
        simulation.compute()
        
        self.simulation = simulation
        # NOTE(review): "outputput" looks like a typo for "output", but other
        # code may read this attribute name -- confirm before renaming.
        self.openfisca_outputput = simulation.output_table.table
        return openfisca_survey
Beispiel #55
0
from pandas import HDFStore

from openfisca_core import model
from openfisca_core.simulations import SurveySimulation

# Script fragment: dump each entity table of the survey into a pytables
# file "survey3.h5", one group per year.
# NOTE(review): `os`, `tables` and `destination_dir` are not defined in this
# chunk -- they must be provided earlier in the file; confirm before running.
filename = os.path.join(model.DATA_DIR, 'survey3.h5')

num_table = 3

# NOTE(review): `input` shadows the builtin of the same name, and this store
# is opened but never read or closed in the visible code.
input = HDFStore(filename)
survey = tables.openFile(destination_dir + "survey3.h5", mode="w")

years = ['2006']

for yr in years:
    simu = SurveySimulation()
    simu.set_config(year=yr)
    simu.set_param()
    simu.set_survey(num_table=num_table)
    survey_year = survey.createGroup("/", "survey_" + yr, "year")
    if num_table == 3:
        # Three-table layout: one pytables table per entity.
        for ent in ['ind', 'men', 'foy', 'fam']:
            tab = simu.survey.table3[ent]
            tab_type = tab.to_records(index=False).dtype
            survey_table = survey.createTable('/survey_' + yr, ent, tab_type)
            survey_table.append(tab.to_records(index=False))
            survey_table.flush()
    if num_table == 1:
        # Single-table layout (branch appears truncated in this chunk).
        tab = simu.survey.table
        tab_type = tab.to_records(index=False).dtype
        to_remote = ['opt_colca', 'quelfic']
def run_OF(dic_input, path_dta_input, param_scenario = None, dic = None, datesim = None, option= 'test_dta'):

    '''
    Run the OF (openfisca) computation on the test cases produced by TaxIPP.
    Input: the .dta file produced by the previous step.

    option='test_dta'  : check the .dta matches `dic` before simulating.
    option='list_dta'  : read the scenario embedded in the .dta itself
                         (then datesim/param_scenario come from the file).
    Returns (simulation, output table) and, for 'list_dta', the scenario dict.
    '''
    def _test_of_dta(dta_input, dic):
        ''' Check that the .dta table found on disk matches the
        expected scenario; drop into pdb otherwise. '''
        data = read_stata(dta_input)
        dic_dta =  data.loc[0,'dic_scenar']
        if str(dic) != str(dic_dta) :
            print "La base .dta permettant de lancer la simulation OF est absente "
            print "La base s'en rapprochant le plus a été construite avec les paramètres : ", dic_dta
            pdb.set_trace()
        else :
            data = data.drop('dic_scenar', 1).sort(['id_foyf', 'id_indiv'], ascending=[True, False])
        return data
    
    def _scenar_dta(dta_input) :
        ''' Read the scenario stored in the first row of the input table. '''
        data = read_stata(dta_input)
        dic_dta =  data.loc[0,'dic_scenar']
        data = data.drop('dic_scenar', 1).sort(['id_foyf', 'id_indiv'], ascending=[True, False])
        return dic_dta, data
    
    if option == 'test_dta':
        data_IPP = _test_of_dta(path_dta_input, dic)
    if option == 'list_dta':
        dic_scenar, data_IPP = _scenar_dta(path_dta_input)
        dict_scenar = dict()
        # NOTE(review): eval of a string read from the data file -- only safe
        # on trusted .dta inputs; consider ast.literal_eval instead.
        expression = "dict_scenar.update(" + dic_scenar +")"
        eval(expression)
        datesim = dict_scenar['datesim']
        param_scenario = dict_scenar
        
    # NOTE(review): `'salbrut' in param_scenario.items()` is always False --
    # .items() yields (key, value) tuples. Probably .values() (or a key test)
    # was intended; as written the salbrut branch is dead code.
    if 'salbrut' in param_scenario.items() :
        if param_scenario['option'] == 'salbrut':
            import openfisca_france
            from openfisca_core import model
            from openfisca_core.simulations import SurveySimulation
            # Start the tax-benefit model from gross wages.
            openfisca_france.init_country(start_from = "brut")
            del dic_input['sal_irpp_old']
            dic_input['sal_brut'] = 'salbrut'
        else : 
            import openfisca_france
            from openfisca_core import model
            from openfisca_core.simulations import SurveySimulation
            openfisca_france.init_country()
        
    else : 
        import openfisca_france
        from openfisca_core import model
        from openfisca_core.simulations import SurveySimulation
        openfisca_france.init_country()
          
    openfisca_survey =  build_input_OF(data_IPP, dic_input)
    openfisca_survey = openfisca_survey.fillna(0)#.sort(['idfoy','noi'])
    simulation = SurveySimulation()
    simulation.set_config(year= datesim, 
                          survey_filename = openfisca_survey,
                          param_file = os.path.join(os.path.dirname(model.PARAM_FILE), 'param.xml'))
    simulation.set_param()
    simulation.compute()
    
    if option == 'list_dta':
        return simulation, simulation.output_table.table, param_scenario
    else: 
        return simulation, simulation.output_table.table