def run_step(self, prev, params):
    """Recode dimension members and geography names into integer ids.

    Args:
        prev: DataFrame produced by the previous step. The code reads the
            columns ``sex``, ``person_type``, ``age_range``, ``ent_id``,
            ``mun_id``, ``level`` and ``count``.
        params: Unused by this step.

    Returns:
        A new DataFrame with the columns ``mun_id``, ``level``, ``sex``,
        ``person_type``, ``age_range`` and ``count``; every column except
        ``level`` is cast to ``int``.

    Raises:
        ValueError: If a value in any column other than ``level`` cannot be
            cast to ``int`` (for example a label missing from the lookup
            mappings).
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = prev.copy()

    # Replace members in dimensions.
    # The result is assigned back instead of calling
    # ``df[col].replace(..., inplace=True)``: the chained in-place form acts
    # on a temporary Series under pandas Copy-on-Write and would leave ``df``
    # untouched.
    # NOTE(review): SEX / PERSON_TYPE / AGE_RANGE are presumably label -> id
    # mappings defined elsewhere in the module; confirm.
    df['sex'] = df['sex'].replace(SEX)
    df['person_type'] = df['person_type'].apply(norm).str.lower()
    df['person_type'] = df['person_type'].replace(PERSON_TYPE)
    df['age_range'] = df['age_range'].replace(AGE_RANGE)

    # Bring geography names to the normalized upper-case form before they are
    # looked up in the replacement mappings below.
    for col in ['ent_id', 'mun_id']:
        df[col] = df[col].apply(norm).str.upper()

    # Replace missing municipalities.
    df['mun_id'] = df['mun_id'].replace(MISSING_MUN)

    # Replace ent: 'MEXICO' is mapped to 15 before the general lookup runs.
    df['ent_id'] = df['ent_id'].replace({'MEXICO': 15})

    # Replace names for ids.
    ent, mun = replace_geo()
    df['ent_id'] = df['ent_id'].replace(ent)
    df['mun_id'] = df['mun_id'].replace(mun)

    # Rows whose municipality is still not one of the known ids get the
    # placeholder "<ent_id>999". The mask is computed once and reused.
    unknown_mun = ~df['mun_id'].isin(list(mun.values()))
    df.loc[unknown_mun, 'mun_id'] = (
        df.loc[unknown_mun, 'ent_id'].astype(str) + '999'
    )

    df = df[
        ['mun_id', 'level', 'sex', 'person_type', 'age_range', 'count']
    ].copy()

    for col in df.columns[df.columns != 'level']:
        df[col] = df[col].astype(int)

    return df
def run_step(self, prev, params):
    """Drop confidential rows and recode dimensions and entities into ids.

    Args:
        prev: DataFrame produced by the previous step. The code reads the
            columns ``count``, ``sex``, ``age_range``, ``person_type``,
            ``company_size``, ``ent_id``, ``mun_id``, ``clave`` and
            ``level``.
        params: Unused by this step.

    Returns:
        A new DataFrame with the columns ``ent_id``, ``mun_id``, ``sex``,
        ``person_type``, ``age_range``, ``count`` and ``level``. Every column
        except ``level`` is cast to ``int``, or to ``float`` when the integer
        cast raises ``ValueError``.
    """
    # Filter confidential values: rows whose count is the marker 'c' (any
    # case) are removed. ``.copy()`` also detaches the result from ``prev``.
    confidential = prev['count'].astype(str).str.lower() == 'c'
    df = prev.loc[~confidential].copy()
    for col in ['sex', 'age_range']:
        df[col] = df[col].replace({'c': 0})

    # Replace members in dimensions.
    # The result is assigned back instead of calling
    # ``df[col].replace(..., inplace=True)``: the chained in-place form acts
    # on a temporary Series under pandas Copy-on-Write and would leave ``df``
    # untouched.
    # NOTE(review): SEX / PERSON_TYPE / AGE_RANGE are presumably label -> id
    # mappings defined elsewhere in the module; confirm.
    df['person_type'] = df['person_type'].str.strip().str.lower().apply(norm)
    df['sex'] = df['sex'].replace(SEX)
    df['person_type'] = df['person_type'].replace(PERSON_TYPE)
    df['age_range'] = df['age_range'].replace(AGE_RANGE)

    # Variable "clave" denotes "mun_id": the incoming ``mun_id`` column is
    # discarded and ``clave`` takes its name. The previous cleaning of the
    # discarded column (NaN fill, normalization, MISSING_MUN lookup) had no
    # effect on the output and is therefore not performed.
    df = df.drop(columns=['company_size', 'mun_id'])
    df = df.rename(columns={'clave': 'mun_id'})

    # Missing entities get the code '33', then names are normalized to upper
    # case so they match the keys of the replacement mappings.
    df.loc[df['ent_id'].isna(), 'ent_id'] = '33'
    df['ent_id'] = df['ent_id'].apply(norm).str.upper()

    # Replace ent: 'MEXICO' is mapped to 15 before the general lookup runs.
    df['ent_id'] = df['ent_id'].replace({'MEXICO': 15})

    # Replace entity names for ids; the municipality mapping is not needed
    # here because ``clave`` supplies the municipality code.
    ent, _ = replace_geo()
    df['ent_id'] = df['ent_id'].replace(ent)

    # Rows whose municipality code equals one of the entity ids get the
    # placeholder "<ent_id>999". The mask is computed once and reused.
    is_entity_code = df['mun_id'].isin(df['ent_id'].unique())
    df.loc[is_entity_code, 'mun_id'] = (
        df.loc[is_entity_code, 'ent_id'].astype(str) + '999'
    )

    df = df[[
        'ent_id', 'mun_id', 'sex', 'person_type', 'age_range', 'count',
        'level',
    ]].copy()

    for col in df.columns[df.columns != 'level']:
        try:
            df[col] = df[col].astype(int)
        except ValueError:
            # Best-effort fallback kept from the original: columns that
            # cannot be represented as int (e.g. containing NaN) become float.
            print('Column {} to float type'.format(col))
            df[col] = df[col].astype(float)

    return df