def make_inflation_country(): """ Formats worldwide inflation data by country (source World Bank). Reads the information location and outputs in the `source_config.py` file. """ print('>> Handling latest "WORLD BANK / inflation / country" data') raw_inflation_file = source_config.inflation_data_files['raw']['latest'] inflation_processed_file = source_config.inflation_data_files['processed'][ 'latest'] inflation_data_raw = pd.read_excel(raw_inflation_file, skiprows=3) inflation_data_list = [[ 'country', 'indicator_type', 'year', 'indicator_value' ]] for _, row in inflation_data_raw.iterrows(): country = row['Country Name'] for year in range(1960, 2030): try: inflation_value = row[str(year)] if ~np.isnan(inflation_value): inflation_data_list.append([ country, 'world_bank_inflation', year, inflation_value ]) except KeyError: pass inflation_data = pd.DataFrame(inflation_data_list[1:], columns=inflation_data_list[0]) write_excel_file(inflation_data, inflation_processed_file)
def make_insee_diplome_formation(decoupage_geo=None): """ Collects and formats 'diplome formation' data for France (source: INSEE). Reads the information location and outputs in the `source_config.py` file. """ if decoupage_geo is None: decoupage_geo = 'commune' output_files = {} print('>> Handling "INSEE diplome formation" data for year...') for year, file_paths in source_config.diplome_formation_files[ decoupage_geo].items(): print('\t- {}'.format(year)) output_files[year] = {} raw_cols = [] raw_file = file_paths['raw'] for item in pd.read_excel(raw_file, skiprows=4).columns: raw_cols.append(_insee_format_column_name(item, year)) raw_data = pd.read_excel(raw_file, skiprows=5) raw_data.columns = raw_cols raw_data['year'] = year for col_prefix in ('pop', 'hommes', 'femmes'): col_list = [col for col in raw_cols if col.startswith(col_prefix)] res = defaultdict(list) res[col_prefix].append([ 'year', 'code_geographique', 'region', 'departement', 'libelle_geographique' ] + col_list) for index, row in raw_data.iterrows(): line_base = [ row['year'], row['code_geographique'], row['region'], row['departement'], row['libelle_geographique'], ] for col_name in col_list: line_base.append(row[col_name]) res[col_prefix].append(line_base) output_data = pd.DataFrame(res[col_prefix][1:], columns=res[col_prefix][0]) diplome_formation_files_prefix = source_config.diplome_formation_files[ decoupage_geo][year]['processed'] output_file = os.path.join( diplome_formation_files_prefix, 'insee_diplome_formation_{}_commune_{}.xls'.format( col_prefix, year)) write_excel_file(output_data, output_file=output_file) output_files[year][col_prefix] = output_file return output_files
def make_insee_couple_famille_menages(decoupage_geo=None): """ Collects and formats 'couple famille menages' data for France (source: INSEE). Reads the information location and outputs in the `source_config.py` file. """ if decoupage_geo is None: decoupage_geo = 'commune' for year, file_paths in source_config.couple_famille_menages_files[ decoupage_geo].items(): raw_cols = [] raw_file = file_paths['raw'] for item in pd.read_excel(raw_file, skiprows=4).columns: raw_cols.append(_insee_format_column_name(item, year)) raw_data = pd.read_excel(raw_file, skiprows=5) raw_data.columns = raw_cols raw_data['year'] = year for col_prefix in ('men', 'pop', 'fam'): col_list = [col for col in raw_cols if col.startswith(col_prefix)] res = defaultdict(list) for col_name in col_list: for index, row in raw_data.iterrows(): res[col_name].append([ row['year'], row['code_geographique'], row['region'], row['departement'], row['libelle_geographique'], col_name, row[col_name] ]) output_data = pd.DataFrame(res[col_name], columns=[ 'year', 'code_geographique', 'region', 'departement', 'libelle_geographique', 'indicator_type', 'indicator_value' ]) couple_famille_menages_processed_file_prefix = \ source_config.couple_famille_menages_files[ decoupage_geo][year]['processed'] output_file = os.path.join( couple_famille_menages_processed_file_prefix, 'insee_couple_famille_menages_commune_{}_{}.xls'.format( col_name, year)) write_excel_file(output_data, output_file=output_file)
def make_confiance_menages(): """ Collects and formats confiance des menages data for France (source INSEE). Reads the information location and outputs in the `source_config.py` file. """ print('>> Handling "INSEE Confiance Menages" data for year...') raw_cols = [ 'date', 'confiance_menage_synthetique', 'niveau_vie_passe', 'niveau_vie_perspective', 'chomage_perspective', 'prix_passe', 'prix_perspective', 'opportunite_achat_important', 'opportunite_epargne', 'epargne_capa_actuelle', 'finance_pers_passe', 'finance_pers_perspective', 'epargne_capa_perspective' ] latest_year = str( max([ int(item) for item in source_config.confiance_data_url['menage'].keys() ])) latest_month = max([ int(item) for item in source_config.confiance_data_url['menage'] [latest_year].keys() ]) raw_confiance_file = source_config.confiance_data_raw_file['menage'][ latest_year][latest_month] confiance_raw = pd.read_excel(raw_confiance_file, skiprows=5) confiance_raw.columns = raw_cols confiance_processed_file_prefix = source_config.confiance_data_processed_file[ 'menage'][latest_year][latest_month] res = defaultdict(list) for col_name in raw_cols[1:]: for index, row in confiance_raw.iterrows(): date_year = pd.to_datetime(row['date']).year date_month = pd.to_datetime(row['date']).month res[col_name].append( [date_year, date_month, col_name, row[col_name]]) output_data = pd.DataFrame( res[col_name], columns=['year', 'month', 'indicator_type', 'indicator_value']) output_file = os.path.join(confiance_processed_file_prefix, 'insee_confiance_{}.xls'.format(col_name)) write_excel_file(output_data, output_file=output_file)
def make_forex_euros(): """ Formats worldwide forex to euros data by country (source ECB). Reads the information location and outputs in the `source_config.py` file. """ print('>> Handling latest "EUROPEAN CENTRAL BANK / forex to euros" data') raw_forex_eur_file = source_config.forex_data_files['latest']['raw'] forex_eur_processed_dir = source_config.forex_data_files['latest'][ 'processed'] forex_eur_data_raw = pd.read_csv(raw_forex_eur_file) for currency in [ curr for curr in forex_eur_data_raw.columns if curr not in 'Date' and not curr.startswith('Unnamed') ]: forex_eur_data_list = [[ 'currency', 'indicator_type', 'year', 'month', 'day', 'indicator_value' ]] for _, row in forex_eur_data_raw.iterrows(): date_raw = row['Date'] date_tokens = date_raw.split('-') date_year = date_tokens[0] date_month = date_tokens[1] date_day = date_tokens[2] forex_currency = row[currency] forex_eur_data_list.append([ currency, 'forex_eur_{}'.format(currency.lower()), date_year, date_month, date_day, forex_currency ]) forex_eur_data = pd.DataFrame(forex_eur_data_list[1:], columns=forex_eur_data_list[0]) write_excel_file( forex_eur_data, os.path.join(forex_eur_processed_dir, 'forex_eur_{}.xls'.format(currency.lower())))
def make_climat_affaires_batiment(): """ Collects and formats climat des affaires / batiment for France (source: INSEE). Reads the information location and outputs in the `source_config.py` file. """ print( '>> Handling "INSEE Climat des Affaires / Batiment" data for year...') raw_cols = [ 'date', 'retournement', 'affaires_batiment_synthetique', 'activite_passe', 'activite_passe_clientele_publique', 'activite_passe_clientele_privee', 'activite_passe_logement_neuf', 'activite_passe_neuf_hors_logement', 'activite_passe_entretien_amelioration', 'activite_prevue', 'activite_prevue_clientele_publique', 'activite_prevue_clientele_privee', 'activite_prevue_logement_neuf', 'activite_prevue_hors_logement_neuf', 'activite_prevue_entretien_amelioration', 'jugement_carnet_commande', 'carnet_commande_par_mois', 'perspective_generale_activite', 'tx_utilisation_capa_production', 'entreprise_sans_accroiss_production', 'entreprise_sans_accroiss_production_insuf_mat', 'entreprise_sans_accroiss_production_insuf_pers', 'entreprise_sans_accroiss_production_insuf_approv', 'entreprise_sans_accroiss_production_cond_climat', 'entreprise_difficulte_recrutement', 'tendance_effectif_passe', 'tendance_effectif_prevu', 'evolution_prevue_prix', 'situation_tresorerie', 'delais_paiement', 'delais_paiement_client_public', 'delais_paiement_client_prive', ] latest_year = str( max([ int(item) for item in source_config.affaires_files['batiment'].keys() ])) raw_aff_bat_file = source_config.affaires_files['batiment'][latest_year][ 'raw'] print(raw_aff_bat_file) aff_bat_raw = pd.read_excel(raw_aff_bat_file, sheet_name='Ensemble', skiprows=6) print(aff_bat_raw.shape) aff_bat_raw.columns = raw_cols aff_bat_processed_file_prefix = source_config.affaires_files['batiment'][ latest_year]['processed'] res = defaultdict(list) for col_name in raw_cols[1:]: for index, row in aff_bat_raw.iterrows(): month_raw, date_year = row['date'].split() date_month = month_abr_fr_to_number[month_raw.replace('.', '')] res[col_name].append( [date_year, date_month, col_name, row[col_name]]) output_data = pd.DataFrame( res[col_name], columns=['year', 'month', 'indicator_type', 'indicator_value']) output_file = os.path.join( aff_bat_processed_file_prefix, 'insee_affaires_batiment_{}.xls'.format(col_name)) write_excel_file(output_data, output_file=output_file)