x for x in header if x not in [ 'CODGEO', 'LIBGEO', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010', 'UU2010' ] ] # Sum NB_F101 to NB_F118 sport['nb_sport'] = sport[[x for x in sport.columns if x[:5] == 'NB_F1' and len(x) == 7]]\ .applymap(lambda x: float(x)).sum(axis=1) # Sum NB_F101_NB_AIREJEU to NB_F118_NB_AIREJEU sport['nb_airjeu_sport'] = sport[[x for x in sport.columns if x[:5] == 'NB_F1' and x[-10:] == 'NB_AIREJEU']]\ .applymap(lambda x: float(x)).sum(axis=1) [features.append(i) for i in ['nb_sport', 'CODGEO']] print "il y a %d iris différentes pour le sport et %d features" % (len( sport.CODGEO.unique()), len(features) - 1) compare_geo(data, sport) data = pd.merge(data, sport[features], on='CODGEO', how='outer') ## Enseignement 1er degré enseignement_1 = pd.read_excel('data/equip-serv-ens-1er-degre-infra.xls', sheetname='IRIS') # creating header from file header = enseignement_1.loc[4].tolist() enseignement_1.columns = header # to get real values enseignement_1 = enseignement_1[5:] # creating new feature : sum all features non aggregated features = [ x for x in header if x not in [ 'CODGEO', 'LIBGEO', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010', 'UU2010'
# --- Sport equipment sheet -------------------------------------------------
# The row labelled 4 of the raw sheet holds the real column names; promote it
# to the header and keep the rows from position 5 onwards as data.
header = sport.loc[4].tolist()
sport.columns = header
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the raw sheet (avoids SettingWithCopyWarning).
sport = sport[5:].copy()

# Columns left out of the feature list (they look like geographic
# identifiers / labels). Every other header entry is kept as a feature.
geo_columns = ['CODGEO', 'LIBGEO', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV',
               'ZE2010', 'UU2010']
features = [col for col in header if col not in geo_columns]

# Row-wise total of the 7-character 'NB_F1..' columns
# (NB_F101 to NB_F118 according to the original comment).
sport_count_cols = [col for col in sport.columns
                    if col.startswith('NB_F1') and len(col) == 7]
sport['nb_sport'] = sport[sport_count_cols].astype(float).sum(axis=1)

# Row-wise total of the matching '..._NB_AIREJEU' columns
# (NB_F101_NB_AIREJEU to NB_F118_NB_AIREJEU according to the original comment).
airejeu_cols = [col for col in sport.columns
                if col.startswith('NB_F1') and col.endswith('NB_AIREJEU')]
sport['nb_airjeu_sport'] = sport[airejeu_cols].astype(float).sum(axis=1)

# NOTE(review): 'nb_airjeu_sport' is computed above but is not added to
# `features`, so it is dropped by the merge below. Behaviour kept as in the
# original -- confirm whether that omission is intentional.
features.extend(['nb_sport', 'CODGEO'])

# CODGEO is the merge key, not a feature, hence the "- 1".
# Single parenthesised argument: prints identically on Python 2 and 3.
print("il y a %d iris différentes pour le sport et %d features" % (
    len(sport.CODGEO.unique()), len(features) - 1))
compare_geo(data, sport)
data = pd.merge(data, sport[features], on='CODGEO', how='outer')

# --- Primary education sheet ("enseignement 1er degre") --------------------
# NOTE(review): `sheetname` is the legacy spelling of `sheet_name` in
# pandas.read_excel; kept unchanged for the pandas version this script
# targets -- switch to `sheet_name` when upgrading pandas.
enseignement_1 = pd.read_excel('data/equip-serv-ens-1er-degre-infra.xls',
                               sheetname='IRIS')

# Same layout as the sport sheet: header in the row labelled 4, data from
# position 5 onwards.
header = enseignement_1.loc[4].tolist()
enseignement_1.columns = header
enseignement_1 = enseignement_1[5:].copy()

features = [col for col in header if col not in geo_columns]

# Row-wise total of the 7-character 'NB_C1..' columns
# (NB_C101 to NB_C105 according to the original comment).
primary_school_cols = [col for col in enseignement_1.columns
                       if col.startswith('NB_C1') and len(col) == 7]
enseignement_1['nb_enseignement_1'] = (
    enseignement_1[primary_school_cols].astype(float).sum(axis=1))