def cn_df_generator(year, list_years=None, drop_duplicates=True, subset=None):
    """
    Generate the table with all the data from Comptabilite Nationale.

    The frames returned by ``tee_df_by_year_generator(year, list_years)`` and
    ``non_tee_df_by_filename_generator(year)`` are stacked into a single
    DataFrame (TEE frames first), the ``year`` column is cast to int and
    duplicated rows are removed.

    Parameters
    ----------
    year : int
        year of INSEE data release
    list_years : list of integers
        list of years of interest. Optional.
    drop_duplicates : bool
        when True (default), rows that are identical on the ``subset``
        columns are dropped before the rounding-based pass. Optional.
    subset : list of str
        columns compared by the first de-duplication pass. When None or
        empty, ['code', 'institution', 'ressources', 'value', 'year'] is
        used. Optional.

    Returns
    -------
    pandas.DataFrame
        the combined table, with an extra ``value_rounded`` column holding
        ``value`` rounded to 3 decimals when ``year == 2011`` and to 5
        decimals otherwise.

    Example
    --------
    >>> year = 2013
    >>> list_years = None
    >>> table2013 = cn_df_generator(2013, list_years = range(1949, 2014))
    Returns the main table of comptabilite nationale data for all years from 1949 to 2013.
    """
    tee_df_by_year = tee_df_by_year_generator(year, list_years)  # arguments: (year, [years_list])
    non_tee_df_by_filename = non_tee_df_by_filename_generator(year)  # argument: (year)

    # Gather every frame and concatenate once: DataFrame.append no longer
    # exists in pandas >= 2.0, and calling it in a loop re-copied the whole
    # accumulated table at each step.
    frames = list(tee_df_by_year.values()) + list(non_tee_df_by_filename.values())
    if frames:
        df_full = pandas.concat(frames, ignore_index=True)
    else:
        # pandas.concat refuses an empty list; keep the empty starting table.
        df_full = pandas.DataFrame()

    df_full[['year']] = df_full[['year']].astype(int)

    if not subset:
        subset = [u'code', u'institution', u'ressources', u'value', u'year']
    if drop_duplicates:
        df_full.drop_duplicates(subset=subset, inplace=True)

    # Second pass: rows whose values differ only beyond the rounding precision
    # are treated as duplicates. The 2011 release is compared on 3 decimals,
    # every other release on 5.
    # NOTE(review): this pass runs even when drop_duplicates is False --
    # confirm that this is the intended behaviour.
    decimals = 3 if year == 2011 else 5
    df_full['value_rounded'] = numpy.around(df_full['value'].astype('float64'), decimals)
    df_full = df_full.drop_duplicates(
        subset=['code', 'institution', 'ressources', 'value_rounded', 'year'])
    return df_full
def cn_df_generator(year, list_years=None, drop_duplicates=True, subset=None):
    """
    Generate the table with all the data from Comptabilite Nationale.

    The frames returned by ``tee_df_by_year_generator(year, list_years)`` and
    ``non_tee_df_by_filename_generator(year)`` are stacked into a single
    DataFrame (TEE frames first), the ``year`` column is cast to int and
    duplicated rows are removed.

    Parameters
    ----------
    year : int
        year of INSEE data release
    list_years : list of integers
        list of years of interest. Optional.
    drop_duplicates : bool
        when True (default), rows that are identical on the ``subset``
        columns are dropped before the rounding-based pass. Optional.
    subset : list of str
        columns compared by the first de-duplication pass. When None or
        empty, ['code', 'institution', 'ressources', 'value', 'year'] is
        used. Optional.

    Returns
    -------
    pandas.DataFrame
        the combined table, with an extra ``value_rounded`` column holding
        ``value`` rounded to 3 decimals when ``year == 2011`` and to 5
        decimals otherwise.

    Example
    --------
    >>> year = 2013
    >>> list_years = None
    >>> table2013 = cn_df_generator(2013, list_years = range(1949, 2014))
    Returns the main table of comptabilite nationale data for all years from 1949 to 2013.
    """
    tee_df_by_year = tee_df_by_year_generator(year, list_years)  # arguments: (year, [years_list])
    non_tee_df_by_filename = non_tee_df_by_filename_generator(year)  # argument: (year)

    # Gather every frame and concatenate once: DataFrame.append no longer
    # exists in pandas >= 2.0, and calling it in a loop re-copied the whole
    # accumulated table at each step.
    frames = list(tee_df_by_year.values()) + list(non_tee_df_by_filename.values())
    if frames:
        df_full = pandas.concat(frames, ignore_index=True)
    else:
        # pandas.concat refuses an empty list; keep the empty starting table.
        df_full = pandas.DataFrame()

    df_full[['year']] = df_full[['year']].astype(int)

    if not subset:
        subset = [u'code', u'institution', u'ressources', u'value', u'year']
    if drop_duplicates:
        df_full.drop_duplicates(subset=subset, inplace=True)

    # Second pass: rows whose values differ only beyond the rounding precision
    # are treated as duplicates. The 2011 release is compared on 3 decimals,
    # every other release on 5.
    # NOTE(review): this pass runs even when drop_duplicates is False --
    # confirm that this is the intended behaviour.
    decimals = 3 if year == 2011 else 5
    df_full['value_rounded'] = numpy.around(df_full['value'].astype('float64'), decimals)
    df_full = df_full.drop_duplicates(
        subset=['code', 'institution', 'ressources', 'value_rounded', 'year'])
    return df_full
def test_duplicate_tee_df():
    """Check that no TEE table generated for the 2013 folder has duplicated rows."""
    folder_year = 2013
    tee_df_by_year = parser_tee.tee_df_by_year_generator(folder_year)
    for key, df in tee_df_by_year.items():
        # The message is built with str.format because folder_year is an int:
        # concatenating it to a str with `+` raised TypeError and hid the
        # actual assertion failure.
        assert not df.duplicated().any(), \
            "There are duplicate rows in TEE {}, in folder: comptes_annees {}".format(key, folder_year)