コード例 #1
0
ファイル: make_data.py プロジェクト: tiphaine/o2-base
def make_inflation_country():
    """
    Formats worldwide inflation data by country (source World Bank). Reads the
    information location and outputs in the `source_config.py` file.
    """
    print('>> Handling latest "WORLD BANK / inflation / country" data')
    raw_inflation_file = source_config.inflation_data_files['raw']['latest']
    inflation_processed_file = source_config.inflation_data_files['processed'][
        'latest']
    inflation_data_raw = pd.read_excel(raw_inflation_file, skiprows=3)
    inflation_data_list = [[
        'country', 'indicator_type', 'year', 'indicator_value'
    ]]
    for _, row in inflation_data_raw.iterrows():
        country = row['Country Name']
        for year in range(1960, 2030):
            try:
                inflation_value = row[str(year)]
                if ~np.isnan(inflation_value):
                    inflation_data_list.append([
                        country, 'world_bank_inflation', year, inflation_value
                    ])
            except KeyError:
                pass
    inflation_data = pd.DataFrame(inflation_data_list[1:],
                                  columns=inflation_data_list[0])
    write_excel_file(inflation_data, inflation_processed_file)
コード例 #2
0
def make_insee_diplome_formation(decoupage_geo=None):
    """
    Collects and formats 'diplome formation' data for France (source:
    INSEE). Reads the information location and outputs in the `source_config.py`
    file.
    """
    if decoupage_geo is None:
        decoupage_geo = 'commune'
    output_files = {}
    print('>> Handling "INSEE diplome formation" data for year...')
    for year, file_paths in source_config.diplome_formation_files[
            decoupage_geo].items():
        print('\t- {}'.format(year))
        output_files[year] = {}
        raw_cols = []
        raw_file = file_paths['raw']
        for item in pd.read_excel(raw_file, skiprows=4).columns:
            raw_cols.append(_insee_format_column_name(item, year))
        raw_data = pd.read_excel(raw_file, skiprows=5)
        raw_data.columns = raw_cols
        raw_data['year'] = year
        for col_prefix in ('pop', 'hommes', 'femmes'):
            col_list = [col for col in raw_cols if col.startswith(col_prefix)]
            res = defaultdict(list)
            res[col_prefix].append([
                'year', 'code_geographique', 'region', 'departement',
                'libelle_geographique'
            ] + col_list)
            for index, row in raw_data.iterrows():
                line_base = [
                    row['year'],
                    row['code_geographique'],
                    row['region'],
                    row['departement'],
                    row['libelle_geographique'],
                ]
                for col_name in col_list:
                    line_base.append(row[col_name])
                res[col_prefix].append(line_base)
            output_data = pd.DataFrame(res[col_prefix][1:],
                                       columns=res[col_prefix][0])
            diplome_formation_files_prefix = source_config.diplome_formation_files[
                decoupage_geo][year]['processed']
            output_file = os.path.join(
                diplome_formation_files_prefix,
                'insee_diplome_formation_{}_commune_{}.xls'.format(
                    col_prefix, year))
            write_excel_file(output_data, output_file=output_file)
            output_files[year][col_prefix] = output_file
    return output_files
コード例 #3
0
def make_insee_couple_famille_menages(decoupage_geo=None):
    """
    Collects and formats 'couple famille menages' data for France (source:
    INSEE). Reads the information location and outputs in the `source_config.py`
    file.
    """
    if decoupage_geo is None:
        decoupage_geo = 'commune'
    for year, file_paths in source_config.couple_famille_menages_files[
            decoupage_geo].items():
        raw_cols = []
        raw_file = file_paths['raw']
        for item in pd.read_excel(raw_file, skiprows=4).columns:
            raw_cols.append(_insee_format_column_name(item, year))
        raw_data = pd.read_excel(raw_file, skiprows=5)
        raw_data.columns = raw_cols
        raw_data['year'] = year
        for col_prefix in ('men', 'pop', 'fam'):
            col_list = [col for col in raw_cols if col.startswith(col_prefix)]
            res = defaultdict(list)
            for col_name in col_list:
                for index, row in raw_data.iterrows():
                    res[col_name].append([
                        row['year'], row['code_geographique'], row['region'],
                        row['departement'], row['libelle_geographique'],
                        col_name, row[col_name]
                    ])
                output_data = pd.DataFrame(res[col_name],
                                           columns=[
                                               'year', 'code_geographique',
                                               'region', 'departement',
                                               'libelle_geographique',
                                               'indicator_type',
                                               'indicator_value'
                                           ])
                couple_famille_menages_processed_file_prefix = \
                    source_config.couple_famille_menages_files[
                        decoupage_geo][year]['processed']
                output_file = os.path.join(
                    couple_famille_menages_processed_file_prefix,
                    'insee_couple_famille_menages_commune_{}_{}.xls'.format(
                        col_name, year))
                write_excel_file(output_data, output_file=output_file)
コード例 #4
0
def make_confiance_menages():
    """
    Collects and formats confiance des menages data for France  (source INSEE).
    Reads the information location and outputs in the `source_config.py` file.
    """
    print('>> Handling "INSEE Confiance Menages" data for year...')
    raw_cols = [
        'date', 'confiance_menage_synthetique', 'niveau_vie_passe',
        'niveau_vie_perspective', 'chomage_perspective', 'prix_passe',
        'prix_perspective', 'opportunite_achat_important',
        'opportunite_epargne', 'epargne_capa_actuelle', 'finance_pers_passe',
        'finance_pers_perspective', 'epargne_capa_perspective'
    ]
    latest_year = str(
        max([
            int(item)
            for item in source_config.confiance_data_url['menage'].keys()
        ]))
    latest_month = max([
        int(item) for item in source_config.confiance_data_url['menage']
        [latest_year].keys()
    ])
    raw_confiance_file = source_config.confiance_data_raw_file['menage'][
        latest_year][latest_month]
    confiance_raw = pd.read_excel(raw_confiance_file, skiprows=5)
    confiance_raw.columns = raw_cols
    confiance_processed_file_prefix = source_config.confiance_data_processed_file[
        'menage'][latest_year][latest_month]
    res = defaultdict(list)
    for col_name in raw_cols[1:]:
        for index, row in confiance_raw.iterrows():
            date_year = pd.to_datetime(row['date']).year
            date_month = pd.to_datetime(row['date']).month
            res[col_name].append(
                [date_year, date_month, col_name, row[col_name]])
        output_data = pd.DataFrame(
            res[col_name],
            columns=['year', 'month', 'indicator_type', 'indicator_value'])
        output_file = os.path.join(confiance_processed_file_prefix,
                                   'insee_confiance_{}.xls'.format(col_name))
        write_excel_file(output_data, output_file=output_file)
コード例 #5
0
def make_forex_euros():
    """
    Formats worldwide forex to euros data by country (source ECB). Reads the
    information location and outputs in the `source_config.py` file.
    """
    print('>> Handling latest "EUROPEAN CENTRAL BANK / forex to euros" data')
    raw_forex_eur_file = source_config.forex_data_files['latest']['raw']
    forex_eur_processed_dir = source_config.forex_data_files['latest'][
        'processed']
    forex_eur_data_raw = pd.read_csv(raw_forex_eur_file)

    for currency in [
            curr for curr in forex_eur_data_raw.columns
            if curr not in 'Date' and not curr.startswith('Unnamed')
    ]:
        forex_eur_data_list = [[
            'currency', 'indicator_type', 'year', 'month', 'day',
            'indicator_value'
        ]]
        for _, row in forex_eur_data_raw.iterrows():
            date_raw = row['Date']
            date_tokens = date_raw.split('-')
            date_year = date_tokens[0]
            date_month = date_tokens[1]
            date_day = date_tokens[2]
            forex_currency = row[currency]
            forex_eur_data_list.append([
                currency, 'forex_eur_{}'.format(currency.lower()), date_year,
                date_month, date_day, forex_currency
            ])
        forex_eur_data = pd.DataFrame(forex_eur_data_list[1:],
                                      columns=forex_eur_data_list[0])
        write_excel_file(
            forex_eur_data,
            os.path.join(forex_eur_processed_dir,
                         'forex_eur_{}.xls'.format(currency.lower())))
コード例 #6
0
def make_climat_affaires_batiment():
    """
    Collects and formats climat des affaires / batiment for France (source:
    INSEE). Reads the information location and outputs in the `source_config.py`
    file.
    """
    print(
        '>> Handling "INSEE Climat des Affaires / Batiment" data for year...')
    raw_cols = [
        'date',
        'retournement',
        'affaires_batiment_synthetique',
        'activite_passe',
        'activite_passe_clientele_publique',
        'activite_passe_clientele_privee',
        'activite_passe_logement_neuf',
        'activite_passe_neuf_hors_logement',
        'activite_passe_entretien_amelioration',
        'activite_prevue',
        'activite_prevue_clientele_publique',
        'activite_prevue_clientele_privee',
        'activite_prevue_logement_neuf',
        'activite_prevue_hors_logement_neuf',
        'activite_prevue_entretien_amelioration',
        'jugement_carnet_commande',
        'carnet_commande_par_mois',
        'perspective_generale_activite',
        'tx_utilisation_capa_production',
        'entreprise_sans_accroiss_production',
        'entreprise_sans_accroiss_production_insuf_mat',
        'entreprise_sans_accroiss_production_insuf_pers',
        'entreprise_sans_accroiss_production_insuf_approv',
        'entreprise_sans_accroiss_production_cond_climat',
        'entreprise_difficulte_recrutement',
        'tendance_effectif_passe',
        'tendance_effectif_prevu',
        'evolution_prevue_prix',
        'situation_tresorerie',
        'delais_paiement',
        'delais_paiement_client_public',
        'delais_paiement_client_prive',
    ]
    latest_year = str(
        max([
            int(item)
            for item in source_config.affaires_files['batiment'].keys()
        ]))
    raw_aff_bat_file = source_config.affaires_files['batiment'][latest_year][
        'raw']
    print(raw_aff_bat_file)
    aff_bat_raw = pd.read_excel(raw_aff_bat_file,
                                sheet_name='Ensemble',
                                skiprows=6)
    print(aff_bat_raw.shape)
    aff_bat_raw.columns = raw_cols
    aff_bat_processed_file_prefix = source_config.affaires_files['batiment'][
        latest_year]['processed']
    res = defaultdict(list)
    for col_name in raw_cols[1:]:
        for index, row in aff_bat_raw.iterrows():
            month_raw, date_year = row['date'].split()
            date_month = month_abr_fr_to_number[month_raw.replace('.', '')]
            res[col_name].append(
                [date_year, date_month, col_name, row[col_name]])
        output_data = pd.DataFrame(
            res[col_name],
            columns=['year', 'month', 'indicator_type', 'indicator_value'])
        output_file = os.path.join(
            aff_bat_processed_file_prefix,
            'insee_affaires_batiment_{}.xls'.format(col_name))
        write_excel_file(output_data, output_file=output_file)