Beispiel #1
0
# `header` is built before this point; install it as the column names.
revenu_personne.columns = header
# Use the same key column name (CODGEO) as the table it is merged into.
revenu_personne.rename(columns={'IRIS': 'CODGEO'}, inplace=True)
# Skip the first six rows so that only the real values remain.
revenu_personne = revenu_personne[6:]
# Columns to merge into `data`: every header entry except the geographic
# identifier/label columns listed below (special list for this file).
features = [
    x for x in header if x not in
    ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010']
]
# No need to sum features here (% and quantile)
# CODGEO is the merge key, which is why the count printed below is len - 1.
features.append('CODGEO')
# print(...) with one pre-formatted argument behaves identically on
# Python 2 and Python 3, unlike the former print statement.
print("il y a  %d iris différentes pour le revenu par personne et %d features" % (
    len(revenu_personne.CODGEO.unique()), len(features) - 1))

# Replace ' - ' with '-' in the LIBCOM labels.
revenu_personne['LIBCOM'] = revenu_personne['LIBCOM'].str.replace(u' - ', u'-')
data = fillna_with_other_table(data, revenu_personne, 'CODGEO')
compare_geo(data, revenu_personne)
# Outer join: keep CODGEO values present in only one of the two tables.
data = pd.merge(data, revenu_personne[features], on='CODGEO', how='outer')

## Income per consumption unit
# `sheet_name` replaces the deprecated `sheetname` keyword of read_excel.
revenu_uc = pd.read_excel(
    'data/RFDU2011IRI.xls',
    sheet_name=1)  # sheet selected by position because its name contains "é"
# The row labelled 5 in the raw sheet is used as the column names.
header = revenu_uc.loc[5].tolist()
revenu_uc.columns = header
# Use the same key column name (CODGEO) as for the other income table.
revenu_uc.rename(columns={'IRIS': 'CODGEO'}, inplace=True)
# Skip the first six rows so that only the real values remain.
revenu_uc = revenu_uc[6:]
# creating new feature : sum of all feature
features = [
Beispiel #2
0
## Income per person
# `sheet_name` replaces the deprecated `sheetname` keyword of read_excel.
# The sheet is selected by position because its name contains "é".
revenu_personne = pd.read_excel('data/RFDP2011IRI.xls', sheet_name=1)
# The row labelled 5 in the raw sheet is used as the column names.
header = revenu_personne.loc[5].tolist()
revenu_personne.columns = header
# Use the same key column name (CODGEO) as the table it is merged into.
revenu_personne.rename(columns={'IRIS': 'CODGEO'}, inplace=True)
# Skip the first six rows so that only the real values remain.
revenu_personne = revenu_personne[6:]
# Columns to merge into `data`: every header entry except the geographic
# identifier/label columns listed below (special list for this file).
features = [
    x for x in header
    if x not in ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010']
]
# No need to sum features here (% and quantile)
# CODGEO is the merge key, which is why the count printed below is len - 1.
features.append('CODGEO')
# print(...) with one pre-formatted argument behaves identically on
# Python 2 and Python 3, unlike the former print statement.
print("il y a  %d iris différentes pour le revenu par personne et %d features" % (
    len(revenu_personne.CODGEO.unique()), len(features) - 1))

# Replace ' - ' with '-' in the LIBCOM labels.
revenu_personne['LIBCOM'] = revenu_personne['LIBCOM'].str.replace(u' - ', u'-')
data = fillna_with_other_table(data, revenu_personne, 'CODGEO')
compare_geo(data, revenu_personne)
# Outer join: keep CODGEO values present in only one of the two tables.
data = pd.merge(data, revenu_personne[features], on='CODGEO', how='outer')


## Income per consumption unit
# `sheet_name` replaces the deprecated `sheetname` keyword of read_excel.
# The sheet is selected by position because its name contains "é".
revenu_uc = pd.read_excel('data/RFDU2011IRI.xls', sheet_name=1)
# The row labelled 5 in the raw sheet is used as the column names.
header = revenu_uc.loc[5].tolist()
revenu_uc.columns = header
# Use the same key column name (CODGEO) as for the other income table.
revenu_uc.rename(columns={'IRIS': 'CODGEO'}, inplace=True)
# Skip the first six rows so that only the real values remain.
revenu_uc = revenu_uc[6:]
# Feature columns: every header entry except the geographic
# identifier/label columns listed below (special list for this file).
features = [
    x for x in header
    if x not in ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010']
]
# No need to sum features here (% and quantile)
Beispiel #3
0
]
# Sum NB_F101 to NB_F118: the 7-character columns starting with 'NB_F1'.
# astype(float) converts whole columns at once instead of calling float()
# on every cell through applymap.
sport['nb_sport'] = sport[[
    x for x in sport.columns if x.startswith('NB_F1') and len(x) == 7
]].astype(float).sum(axis=1)
# Sum NB_F101_NB_AIREJEU to NB_F118_NB_AIREJEU
sport['nb_airjeu_sport'] = sport[[
    x for x in sport.columns
    if x.startswith('NB_F1') and x.endswith('NB_AIREJEU')
]].astype(float).sum(axis=1)
# Add the two aggregated columns and the merge key to the columns to merge.
features.extend(['nb_sport', 'IRIS', 'nb_airjeu_sport'])
# IRIS is the merge key, which is why the count printed below is len - 1.
print("il y a  %d iris différentes pour le sport et %d features" %
      (len(sport.IRIS.unique()), len(features) - 1))

compare_geo(data, sport)
# Outer join: keep IRIS present in only one of the two tables.
data = pd.merge(data, sport[features], on='IRIS', how='outer')
# Adding new IRIS, filling geo information with new files
data = fillna_with_other_table(data,
                               sport,
                               'IRIS',
                               columns=['COM', 'DEP', 'REG'])
_check_data(data, "Sport 16")
# The raw table is no longer needed once merged; drop the reference.
del sport

## Primary education ("enseignement 1er degré")
# Load the sheet named 'IRIS' from the workbook.
enseignement_1 = pd.read_excel('data/equip-serv-ens-1er-degre-infra-2016.xls',
                               sheet_name='IRIS')
# The row labelled 4 in the raw sheet is used as the column names.
header = enseignement_1.loc[4].tolist()
enseignement_1.columns = header
# Skip the first five rows so that only the real values remain.
enseignement_1 = enseignement_1[5:]
# creating new feature : sum all features non aggregated
features = [
    x for x in header if x not in
Beispiel #4
0
# Skip the first five rows so that only the real values remain.
sport = sport[5:]
# Columns to merge into `data`: every header entry except the geographic
# identifier/label columns listed below.
features = [
    x for x in header
    if x not in ['IRIS', 'LIB_IRIS', 'COM', 'LIB_COM', 'REG', 'REG2016', 'DEP']
]
# Sum NB_F101 to NB_F118: the 7-character columns starting with 'NB_F1'.
# astype(float) converts whole columns at once instead of calling float()
# on every cell through applymap.
sport['nb_sport'] = sport[[
    x for x in sport.columns if x.startswith('NB_F1') and len(x) == 7
]].astype(float).sum(axis=1)
# Sum NB_F101_NB_AIREJEU to NB_F118_NB_AIREJEU
sport['nb_airjeu_sport'] = sport[[
    x for x in sport.columns
    if x.startswith('NB_F1') and x.endswith('NB_AIREJEU')
]].astype(float).sum(axis=1)
# Add the two aggregated columns and the merge key to the columns to merge.
features.extend(['nb_sport', 'IRIS', 'nb_airjeu_sport'])
# IRIS is the merge key, which is why the count printed below is len - 1.
print("il y a  %d iris différentes pour le sport et %d features" % (
    len(sport.IRIS.unique()), len(features) - 1))

compare_geo(data, sport)
# Outer join: keep IRIS present in only one of the two tables.
data = pd.merge(data, sport[features], on='IRIS', how='outer')
# Adding new IRIS, filling geo information with new files
data = fillna_with_other_table(data, sport, 'IRIS', columns=['COM', 'DEP', 'REG'])
_check_data(data, "Sport 16")
# The raw table is no longer needed once merged; drop the reference.
del sport


## Primary education ("enseignement 1er degré")
# `sheet_name` replaces the deprecated `sheetname` keyword of read_excel.
enseignement_1 = pd.read_excel('data/equip-serv-ens-1er-degre-infra-2016.xls',
                               sheet_name='IRIS')
# The row labelled 4 in the raw sheet is used as the column names.
header = enseignement_1.loc[4].tolist()
enseignement_1.columns = header
# Skip the first five rows so that only the real values remain.
enseignement_1 = enseignement_1[5:]
# Feature columns: every header entry except the geographic
# identifier/label columns listed below.
features = [
    x for x in header
    if x not in ['IRIS', 'LIB_IRIS', 'COM', 'LIB_COM', 'REG', 'REG2016', 'DEP']
]
# Sum NB_C101 to NB_C105
enseignement_1['nb_enseignement_1'] = enseignement_1[[x for x in enseignement_1.columns if x[:2] == 'C1' and len(x) == 4]]\