Example #1
0
def test_duplicate_non_tee_df():
    """Check that no DataFrame of the 2013 folder contains duplicated rows.

    Every DataFrame returned by
    ``parser_non_tee.non_tee_df_by_filename_generator(2013)`` is checked with
    ``DataFrame.duplicated``; the test fails on the first one that has at
    least one fully duplicated row, naming the offending key.
    """
    folder_year = 2013
    non_tee_df_by_filename = parser_non_tee.non_tee_df_by_filename_generator(
        folder_year)
    for key, df in non_tee_df_by_filename.items():
        # folder_year is an int, so the message is built with format():
        # concatenating it to a str would raise TypeError and hide the
        # actual assertion failure.
        assert not df.duplicated().any(), (
            "There are duplicate rows in {}, in folder: comptes_annees {}"
            .format(key, folder_year))
Example #2
0
def cn_df_generator(year, list_years=None, drop_duplicates=True, subset=None):
    """
    Generates the table with all the data from Comptabilite Nationale.

    The frames produced by ``tee_df_by_year_generator`` and
    ``non_tee_df_by_filename_generator`` are stacked into a single DataFrame
    with a fresh integer index.

    Parameters
    ----------
    year : int
        year of INSEE data release
    list_years : list of integers
        list of years of interest. Optional.
    drop_duplicates : bool
        when True (default), duplicated rows are removed in two passes:
        first on ``subset``, then on the fixed key
        (code, institution, ressources, value_rounded, year).
    subset : list of column names
        columns used by the first de-duplication pass. Optional; when empty
        or None it defaults to
        ['code', 'institution', 'ressources', 'value', 'year'].

    Returns
    -------
    pandas.DataFrame
        The concatenated table, with 'year' cast to int. When
        ``drop_duplicates`` is True the table also carries a 'value_rounded'
        column ('value' as float64, rounded to 3 decimals for the 2011
        release and to 5 decimals otherwise).

    Example
    --------
    >>> table2013 = cn_df_generator(2013, list_years = range(1949, 2014))

    Returns the main table of comptabilite nationale data for all years from 1949 to 2013.
    """
    tee_df_by_year = tee_df_by_year_generator(year, list_years)
    non_tee_df_by_filename = non_tee_df_by_filename_generator(year)

    # Gather every frame first and concatenate once: DataFrame.append copied
    # the whole accumulated table at each step and no longer exists in
    # pandas >= 2.0.
    frames = list(tee_df_by_year.values())
    frames.extend(non_tee_df_by_filename.values())
    if frames:
        df_full = pandas.concat(frames, ignore_index=True)
    else:
        # pandas.concat rejects an empty list; keep the original starting point.
        df_full = pandas.DataFrame()

    df_full['year'] = df_full['year'].astype(int)
    if not subset:
        subset = [u'code', u'institution', u'ressources', u'value', u'year']
    if drop_duplicates:
        df_full.drop_duplicates(subset=subset, inplace=True)
        # Second pass: rows whose values only differ beyond the rounding
        # precision are treated as duplicates. The 2011 release uses a
        # coarser precision than the others.
        decimals = 3 if year == 2011 else 5
        df_full['value_rounded'] = numpy.around(
            df_full['value'].astype('float64'), decimals)
        df_full = df_full.drop_duplicates(
            ['code', 'institution', 'ressources', 'value_rounded', 'year'])
    return df_full
def cn_df_generator(year, list_years = None, drop_duplicates = True, subset = None):
    """
    Generates the table with all the data from Comptabilite Nationale.

    Parameters
    ----------
    year : int
        year of INSEE data release
    list_years : list of integers
        list of years of interest. Optional.
    drop_duplicates : bool
        if True (default), remove duplicated rows: once using ``subset``,
        then once more using 'value_rounded' in place of 'value'.
    subset : list of column names
        columns compared by the first duplicate removal. Optional; falls
        back to ['code', 'institution', 'ressources', 'value', 'year']
        when None or empty.

    Returns
    -------
    pandas.DataFrame
        All TEE and non-TEE frames stacked with a new integer index and an
        integer 'year' column. A 'value_rounded' column (float64 'value'
        rounded to 3 decimals when ``year`` is 2011, 5 decimals otherwise)
        is added when ``drop_duplicates`` is True.

    Example
    --------
    >>> table2013 = cn_df_generator(2013, list_years = range(1949, 2014))

    Returns the main table of comptabilite nationale data for all years from 1949 to 2013.
    """
    tee_df_by_year = tee_df_by_year_generator(year, list_years)  # arguments: (year, [years_list])
    non_tee_df_by_filename = non_tee_df_by_filename_generator(year)  # argument: (year)

    # One concat over all frames replaces the repeated DataFrame.append calls,
    # which were quadratic and have been removed from pandas 2.0 onwards.
    all_frames = list(tee_df_by_year.values()) + list(non_tee_df_by_filename.values())
    if all_frames:
        df_full = pandas.concat(all_frames, ignore_index = True)
    else:
        # pandas.concat raises on an empty list, so fall back to an empty frame.
        df_full = pandas.DataFrame()

    df_full['year'] = df_full['year'].astype(int)
    if not subset:
        subset = [u'code', u'institution', u'ressources', u'value', u'year']
    if drop_duplicates:
        df_full.drop_duplicates(subset = subset, inplace = True)
        # The 2011 release is compared at 3 decimals, every other one at 5.
        precision = 3 if year == 2011 else 5
        df_full['value_rounded'] = numpy.around(df_full['value'].astype('float64'), precision)
        df_full = df_full.drop_duplicates(['code', 'institution', 'ressources', 'value_rounded', 'year'])
    return df_full
def test_duplicate_non_tee_df():
    """Assert that none of the DataFrames loaded for folder 2013 has duplicated rows.

    The DataFrames come from
    ``parser_non_tee.non_tee_df_by_filename_generator(2013)``, keyed by the
    dictionary key reported in the failure message.
    """
    folder_year = 2013
    non_tee_df_by_filename = parser_non_tee.non_tee_df_by_filename_generator(folder_year)
    for key, df in non_tee_df_by_filename.items():
        has_duplicates = df.duplicated().any()
        # Use format() for the message: folder_year is an int and "str + int"
        # would raise TypeError instead of reporting the failure.
        assert not has_duplicates, \
            "There are duplicate rows in {}, in folder: comptes_annees {}".format(key, folder_year)