def load_libelles_emploi_data(decennie=None, debug=False, force_recreate=False):
    """Return counts of slugified job labels per (annee, versant) for one decade.

    The counts are cached in ``libemploi_<decennie>.h5`` inside
    ``libelles_emploi_directory`` under the HDF key ``'libemploi'``. The cache
    is read when the file exists and ``force_recreate`` is false; otherwise the
    counts are rebuilt from ``get_careers`` and written to that file.

    Parameters
    ----------
    decennie : required despite the ``None`` default (checked by an assert).
    debug : forwarded unchanged to ``get_careers``.
    force_recreate : when true, ignore any existing cache file and rebuild it.

    Returns
    -------
    The counts, sliced with ``.loc[2006:2014, ]`` on the first index level.
    """
    assert decennie is not None
    h5_path = os.path.join(
        libelles_emploi_directory, 'libemploi_{}.h5'.format(decennie))
    cache_is_usable = os.path.exists(h5_path) and not force_recreate

    if cache_is_usable:
        counts = pd.read_hdf(h5_path, 'libemploi')
        log.info("Libellés emploi read from {}".format(h5_path))
        return counts.loc[2006:2014, ]

    labels = get_careers(variable='libemploi', decennie=decennie, debug=debug)
    statut = get_careers(variable='statut', decennie=decennie, debug=debug)
    # No explicit key: pandas joins on the columns both frames share.
    merged = labels.merge(statut.query("statut in ['T', 'H']"), how='inner')
    merged['libemploi_slugified'] = merged.libemploi.apply(slugify, separator="_")
    merged = merged.rename(columns={'statut': 'versant'})

    by_year_and_versant = merged.groupby([u'annee', u'versant'])
    counts = by_year_and_versant['libemploi_slugified'].value_counts()

    log.info("Generating and saving libellés emploi to {}".format(h5_path))
    counts.to_hdf(h5_path, 'libemploi')
    # The file stores every year; only the returned value is restricted.
    return counts.loc[2006:2014, ]
def load_libelles(data_path=None, debug=False):
    """Load job labels restricted to the rows flagged as not adjusted.

    Loads ``libemploi``, ``lib_cir``, ``lib_netneh`` and ``f_coll`` through
    ``get_careers``, inner-joins them, keeps rows whose ``statut`` (a copy of
    ``f_coll``) is ``'T'`` or ``'H'``, and then keeps only the rows where
    ``lib_cir`` is non-empty and ``lib_netneh`` is empty.

    Parameters
    ----------
    data_path, debug : forwarded unchanged to every ``get_careers`` call.

    Returns
    -------
    A DataFrame with the columns ``ident``, ``libemploi``, ``annee`` and
    ``libemploi_slugified``.
    """
    labels = get_careers(variable='libemploi', data_path=data_path, debug=debug)
    cir_labels = get_careers(variable='lib_cir', data_path=data_path, debug=debug)
    netneh_labels = get_careers(variable='lib_netneh', data_path=data_path, debug=debug)
    labels['libemploi_slugified'] = labels.libemploi.apply(slugify, separator="_")

    status = get_careers(variable='f_coll', data_path=data_path, debug=debug)
    status['statut'] = status['f_coll']
    kept_status = status.query("statut in ['T', 'H']")

    join_keys = ["ident", "annee"]
    merged = (
        labels
        .merge(cir_labels, how='inner', on=join_keys)
        .merge(netneh_labels, how='inner', on=join_keys)
        # No explicit key here: pandas joins on the columns both frames share.
        .merge(kept_status, how='inner')
    )

    # Label filtering rule, per Isabelle Bridenne: "select the rows where
    # lib_cir_YYYY != '' and lib_netneh == ''; these are the non-adjusted cases."
    not_adjusted = (merged.lib_cir != '') & (merged.lib_netneh == '')
    output_columns = ["ident", "libemploi", "annee", "libemploi_slugified"]
    return merged.loc[not_adjusted, output_columns]
def build_destinations_dataframes(decennie = None):
    """Build the destinations frames for raw and purged post-2010 careers.

    The ``c_netneh`` careers are loaded twice through ``get_careers``: once
    as-is, and once passed through ``clean_empty_netneh``. Both are restricted
    to ``annee > 2010`` and ordered by ``ident`` then ``annee`` before being
    handed to ``get_destinations_dataframe``.

    Returns
    -------
    tuple ``(destinations, purged_destinations)``.
    """
    raw_careers = (
        get_careers(variable = 'c_netneh', decennie = decennie)
        .sort_values(['ident', 'annee'])
        .query('annee > 2010')
    )
    destinations = get_destinations_dataframe(raw_careers)

    # Fresh load for the purged variant; here the filter comes before the sort.
    reloaded_careers = (
        get_careers(variable = 'c_netneh', decennie = decennie)
        .query('annee > 2010')
        .sort_values(['ident', 'annee'])
    )
    purged_destinations = get_destinations_dataframe(
        clean_empty_netneh(reloaded_careers))

    return destinations, purged_destinations
def load_libelles(decennie = 1980, debug = False, year = 2014):
    """Load job labels joined with ``statut`` from a given year onwards.

    Parameters
    ----------
    decennie, debug : forwarded unchanged to ``get_careers``.
    year : only rows with ``annee >= year`` are kept.

    Returns
    -------
    The inner join of the ``libemploi`` and ``statut`` frames, including the
    added ``libemploi_slugified`` column, filtered on ``annee``.
    """
    labels = get_careers(variable = 'libemploi', decennie = decennie, debug = debug)
    labels['libemploi_slugified'] = labels.libemploi.apply(slugify, separator = "_")
    status = get_careers(variable = 'statut', decennie = decennie, debug = debug)

    # No explicit key: pandas joins on the columns both frames share.
    merged = labels.merge(status, how = 'inner')
    from_year_on = merged.annee >= year
    return merged[from_year_on]
def load_libelles(decennie=None, debug=False):
    """Load the non-empty job labels of rows whose ``statut`` is 'T' or 'H'.

    Parameters
    ----------
    decennie, debug : forwarded unchanged to ``get_careers``.

    Returns
    -------
    The inner join of the ``libemploi`` frame (with an added
    ``libemploi_slugified`` column) and the filtered ``statut`` frame, without
    the rows whose ``libemploi`` is the empty string.
    """
    labels = get_careers(variable='libemploi', decennie=decennie, debug=debug)
    labels['libemploi_slugified'] = labels.libemploi.apply(slugify, separator="_")
    status = get_careers(variable='statut', decennie=decennie, debug=debug)

    kept_status = status.query("statut in ['T', 'H']")
    # No explicit key: pandas joins on the columns both frames share.
    merged = labels.merge(kept_status, how='inner')
    has_label = merged.libemploi != ''
    return merged[has_label]
def build_transitions_pct_by_grade_initial(decennie = 1970):
    """Compute the distribution of the number of transitions conditional on the first grade.

    The ``c_netneh`` careers are loaded through ``get_careers`` and ordered by
    ``ident`` then ``annee``. The same computation is run on four samples: all
    years, all years except 2010, and both of these passed through
    ``clean_empty_netneh``.

    Returns
    -------
    The four results concatenated with ``pd.concat``, keyed by sample name.
    """
    careers = get_careers(variable = 'c_netneh', decennie = decennie).sort_values(['ident', 'annee'])

    def get_transitions_grade_init(carrieres):
        """Count transitions per ident and tabulate them by first-year grade."""
        previous_row = carrieres.shift()
        # Keep rows whose previous row belongs to the same ident and has a
        # non-null grade.
        selection = previous_row.c_netneh.notnull() & (carrieres.ident == previous_row.ident)
        grade_changed = carrieres.c_netneh != previous_row.c_netneh

        df = pd.DataFrame()
        df['ident'] = carrieres.ident[selection]
        df['transition'] = grade_changed[selection]

        # Grade held in the earliest year present in this sample.
        premiere_annee = carrieres.annee.min()  # analysis:ignore
        premier_grade = carrieres.query('annee == @premiere_annee')[['ident', 'c_netneh']]

        with_first_grade = df.merge(premier_grade, on = 'ident', how = 'left')
        transitions = (
            with_first_grade
            .groupby(['ident', 'c_netneh'])['transition']
            .sum()
            .astype(int)
            .reset_index()
        )

        counts = transitions.groupby('c_netneh')['transition'].value_counts()
        result = pd.DataFrame()
        # The cumulative sum runs over the whole sorted series, not per grade.
        result['population'] = counts.sort_values(ascending = False).cumsum()
        result['cdf'] = result.population / result.population.max()
        return result

    # Evaluated in the same order as the keys below.
    all_years = get_transitions_grade_init(careers)
    without_2010 = get_transitions_grade_init(careers[careers.annee != 2010])
    all_years_purged = get_transitions_grade_init(clean_empty_netneh(careers))
    without_2010_purged = get_transitions_grade_init(
        clean_empty_netneh(careers.query('annee != 2010')))

    return pd.concat(dict(
        annees_2010_2014 = all_years,
        annees_2011_2014 = without_2010,
        annees_2010_2014_purgees = all_years_purged,
        annees_2011_2014_purgees = without_2010_purged,
        ))
def build_destinations_by_grade(decennie = None):
    """Compute the distribution of the number of destination grades.

    The ``c_netneh`` careers are loaded through ``get_careers`` and ordered by
    ``ident`` then ``annee``. ``get_destinations_by_grade`` is applied to four
    samples: all years, all years except 2010, and both of these passed through
    ``clean_empty_netneh``.

    Returns
    -------
    The four results concatenated with ``pd.concat``, keyed by sample name.
    """
    careers = get_careers(variable = 'c_netneh', decennie = decennie).sort_values(['ident', 'annee'])

    # Evaluated in the same order as the keys below.
    all_years = get_destinations_by_grade(careers)
    without_2010 = get_destinations_by_grade(careers[careers.annee != 2010])
    all_years_purged = get_destinations_by_grade(clean_empty_netneh(careers))
    without_2010_purged = get_destinations_by_grade(
        clean_empty_netneh(careers.query('annee != 2010')))

    return pd.concat(dict(
        annees_2010_2014 = all_years,
        annees_2011_2014 = without_2010,
        annees_2010_2014_purgees = all_years_purged,
        annees_2011_2014_purgees = without_2010_purged,
        ))
def merge_libelles_emploi_data(debug=True, decennie=None):
    """Build and save the slugified job-label counts for one or more decades.

    For each decade the ``libemploi`` and ``statut`` variables are loaded
    through ``get_careers`` and inner-joined on the rows whose ``statut`` is
    'T' or 'H'. Slugified labels are then counted per (annee, versant) and the
    counts are written to ``libemploi_<decennie>.h5`` in
    ``libelles_emploi_tmp_directory`` under the HDF key ``'libemploi'``.

    Parameters
    ----------
    debug : forwarded unchanged to ``get_careers``.
    decennie : decade to process. When ``None`` (the default), the decades
        1950 and 1970 are processed in that order.

    Returns
    -------
    The counts of the last decade processed.
    """
    # ``decennie`` needs a default: a parameter without one cannot follow
    # ``debug=True`` in a Python signature.
    decennies = [1950, 1970] if decennie is None else [decennie]

    for current_decennie in decennies:
        # One file per decade, so a later decade does not overwrite an earlier
        # one and the log message shows the real path.
        libemploi_h5 = os.path.join(
            libelles_emploi_tmp_directory,
            'libemploi_{}.h5'.format(current_decennie))

        libemploi = get_careers(
            variable='libemploi', decennie=current_decennie, debug=debug)
        statut = get_careers(
            variable='statut', decennie=current_decennie, debug=debug)
        # No explicit key: pandas joins on the columns both frames share.
        libemploi = libemploi.merge(
            statut.query("statut in ['T', 'H']"),
            how='inner',
            )
        libemploi['libemploi_slugified'] = libemploi.libemploi.apply(
            slugify, separator="_")
        libemploi.rename(columns=dict(statut='versant'), inplace=True)
        libemplois = libemploi.groupby(
            [u'annee', u'versant'])['libemploi_slugified'].value_counts()

        log.info(
            "Generating and saving libellés emploi to {}".format(libemploi_h5))
        libemplois.to_hdf(libemploi_h5, 'libemploi')

    return libemplois
def build_transitions_pct_by_echantillon(decennie = 1970):
    """Compute the distribution of the number of transitions for each sample.

    The ``c_netneh`` careers are loaded through ``get_careers`` and ordered by
    ``ident`` then ``annee``. Transitions are counted per ident on four
    samples: all years, all years except 2010, and both of these passed through
    ``clean_empty_netneh``.

    Returns
    -------
    A DataFrame built from the concatenated per-sample distributions. The index
    is reset, the columns ``level_0``, ``level_1`` and ``transition`` are
    renamed to ``echantillon``, ``transitions`` and ``population``, and the
    rows are sorted by ``echantillon`` then ``transitions``.
    """
    careers = get_careers(variable = 'c_netneh', decennie = decennie).sort_values(['ident', 'annee'])

    def get_transitions(carrieres, normalize = True):
        """Return the value counts of the number of grade changes per ident."""
        previous_row = carrieres.shift()
        # Keep rows whose previous row belongs to the same ident and has a
        # non-null grade.
        selection = previous_row.c_netneh.notnull() & (carrieres.ident == previous_row.ident)
        grade_changed = carrieres.c_netneh != previous_row.c_netneh

        transitions = pd.DataFrame()
        transitions['ident'] = carrieres.ident[selection]
        transitions['transition'] = grade_changed[selection]

        per_ident = transitions.groupby('ident').sum().astype(int).squeeze()
        return per_ident.value_counts(normalize = normalize)

    # Evaluated in the same order as the keys below.
    all_years = get_transitions(careers)
    without_2010 = get_transitions(careers[careers.annee != 2010])
    all_years_purged = get_transitions(clean_empty_netneh(careers))  # 110 transitions,
    without_2010_purged = get_transitions(
        clean_empty_netneh(careers.query('annee != 2010')))  # 390701

    by_sample = pd.concat(dict(
        annees_2010_2014 = all_years,
        annees_2011_2014 = without_2010,
        annees_2010_2014_purgees = all_years_purged,
        annees_2011_2014_purgees = without_2010_purged,
        ))

    column_names = {
        'level_0': 'echantillon',
        'level_1': 'transitions',
        'transition': 'population',
        }
    table = by_sample.reset_index().rename(columns = column_names)
    return table.sort_values(['echantillon', 'transitions'])
from sas7bdat import SAS7BDAT
from fonction_publique.base import raw_directory_path, clean_directory_path, get_careers, parser
from fonction_publique.merge_careers_and_legislation import get_grilles

libelles_emploi_directory = parser.get('correspondances', 'libelles_emploi_directory')
table_cdc_path = "M:/CNRACL/correspondance/LS/2017_lib2014/2017_lib2014.xlsx"

# Original labels: collect the 2014 rows of every decade, then count how many
# times each label occurs.
decennies = [1950, 1960, 1970, 1980, 1990]
libemploi_2014_by_decennie = []
for decennie in decennies:
    print("Processing decennie {}".format(decennie))
    libemploi = get_careers(variable = 'libemploi', decennie = decennie, debug = False)
    libemploi_2014_by_decennie.append(libemploi[libemploi.annee == 2014])
# A single concat replaces the repeated DataFrame.append, which copied the
# accumulated frame at every iteration.
libemploi_all = pd.concat(libemploi_2014_by_decennie)

libemplois2014 = libemploi_all.libemploi.value_counts()
libemplois2014 = pd.DataFrame({'libemploi': libemplois2014.index, 'nb_obs': libemplois2014.values})

# CDC table, most frequent labels first.
table_cdc = pd.read_excel(table_cdc_path, encoding = 'utf-8')
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
table_cdc = table_cdc.sort_values('nb_obs', ascending = False)

# Merge of the two label tables.
data1 = table_cdc.rename(columns = {'nb_obs': 'nb_obs_CDC'})
data1 = data1[['lib_cir_2014', 'nb_obs_CDC']]
# fillna returns a new frame; the previous chained assignment
# (data1.lib_cir_2014[mask] = "") could write to a temporary copy instead.
data1 = data1.fillna({'lib_cir_2014': ""})
# Script fragment: `valid_data_frame`, `libemploi`, `correspondance_data_frame`
# and `debug` are defined before the visible part of the file.
assert valid_data_frame, 'The correspondace data frame is not valid'

# Left join of the labels with the correspondence table: every `libemploi` row
# is kept whether or not a match is found. `libelle` is the right-hand
# counterpart of `libemploi_slugified` in the join, and is dropped afterwards.
libemploi_grade = (libemploi
    .merge(
        correspondance_data_frame,
        how = 'left',
        left_on = ['statut', 'annee', 'libemploi_slugified'],
        right_on = ['versant', 'annee', 'libelle'],
        )
    .drop('libelle', axis = 1)
    )
# Number of distinct values in `grade`; the result is not stored.
len(set(libemploi_grade.grade))

# 2. Adding code netneh
netneh = get_careers(variable = 'c_netneh', decennie = 1980, debug = debug)
netneh = netneh[netneh.annee==2014]
# Sanity check: both frames must have the same number of rows before merging.
assert (len(netneh.ident) == len(libemploi_grade.ident))
final_merge = (libemploi_grade
    .merge(
        netneh,
        how = 'inner',
        on = ['ident', 'annee'],
        )
    )
# NOTE(review): not used in the visible part of the script; presumably labels
# to inspect further down, confirm against the rest of the file.
check_list = ['ADJOINT ADMINIST HOSP 2EME CL (E03)', 'INFIRMIER DE CLASSE SUPERIEURE(*)']
from __future__ import division from fonction_publique.base import get_careers decennie = 1950 debug = False libemploi = get_careers(variable='libemploi', decennie=decennie, debug=debug, where="annee >= 2000") print libemploi.annee.value_counts() statut = get_careers(variable='statut', decennie=decennie, debug=debug, where="annee >= 2000") print statut.annee.value_counts() print statut.statut.value_counts() libemploi = (libemploi.merge( statut.query("statut in ['T', 'H']"), how='inner', )) libemploi_annee = libemploi.query('annee == 2000') for statut_i in libemploi_annee.statut.unique(): print statut_i print libemploi_annee.query('statut == @statut_i').libemploi.value_counts()