def run_step(self, prev, params):
    """Reshape sheet '2.4' of the Excel source into long format.

    The sheet is read, its headers are normalised, rows whose state label
    contains 'Total' are dropped and state names are translated into ids
    through the first table returned by ``get_dimensions()``.  The twelve
    columns are then renamed by position, the three ``count_*`` /
    ``value_*_c`` column pairs are stacked into ``count`` / ``value_c`` rows
    tagged with ``investment_type``, every state is passed through
    ``validate_category`` and rows whose ``value_c`` is the marker 'c'
    (case-insensitive) are removed.

    Args:
        prev: Excel source handed to ``pd.read_excel``.
        params: Step parameters; not used by this step.

    Returns:
        DataFrame with ``value_c`` cast to float.  The remaining columns are
        whatever ``validate_category`` returns (presumably a DataFrame with
        ent_id, year, quarter_id, count, value_c and investment_type -
        TODO confirm against that helper).
    """
    df = pd.read_excel(prev, sheet_name='2.4')
    df.columns = [
        norm(
            x.strip().lower().replace(' ', '_').replace('-', '_').replace(
                '%', 'perc'))
        for x in df.columns
    ]
    df = df.loc[~df['entidad_federativa'].str.contains('Total')].copy()

    # Translate state names into ids.  Plain assignment is used instead of a
    # chained ``replace(..., inplace=True)``, which does not update the parent
    # frame when pandas Copy-on-Write is active.
    dim_geo = get_dimensions()[0]
    ent_ids = dict(zip(dim_geo['ent_name'], dim_geo['ent_id']))
    df['entidad_federativa'] = df['entidad_federativa'].replace(ent_ids)

    # Columns are renamed by position, so the sheet layout must match.
    df.columns = [
        'ent_id', 'year', 'quarter_id', 'value_between_companies',
        'value_new_investments', 'value_re_investments',
        'count_between_companies', 'count_new_investments',
        'count_re_investments', 'value_between_companies_c',
        'value_new_investments_c', 'value_re_investments_c'
    ]

    # quarter_id becomes the year digits followed by the quarter number,
    # e.g. year 2020 and quarter 1 give 20201.
    year_txt = df['year'].astype(int).astype(str)
    quarter_txt = df['quarter_id'].astype(int).astype(str)
    df['quarter_id'] = (year_txt + quarter_txt).astype(int)

    df = df.drop(columns=[
        'value_between_companies', 'value_new_investments',
        'value_re_investments'
    ])

    # Stack the three investment types into long format.
    base = ['ent_id', 'year', 'quarter_id']
    pieces = []
    for option in ('between_companies', 'new_investments', 're_investments'):
        piece = df[base + ['count_{}'.format(option),
                           'value_{}_c'.format(option)]].copy()
        piece.columns = base + ['count', 'value_c']
        piece['investment_type'] = option
        pieces.append(piece.dropna(subset=['value_c']))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df = pd.concat(pieces)
    df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

    # Validate each state separately, as the helper is called per ent_id.
    validated = [
        validate_category(df.loc[df['ent_id'] == ent], 'investment_type',
                          'value_c', 'c')
        for ent in df['ent_id'].unique()
    ]
    # With no states left there is nothing to concatenate; keep the (empty)
    # long-format frame so the column lookups below still work.
    df = pd.concat(validated) if validated else df.iloc[0:0]

    df = df.loc[df['value_c'].astype(str).str.lower() != 'c'].copy()
    df['value_c'] = df['value_c'].astype(float)
    return df
def run_step(self, prev, params):
    """Compute per-industry top-3 rankings and write them to a JSON file.

    Every sheet listed in ``params['sheets']`` is read and cleaned.  For each
    distinct industry code in the sheet, rows are summed per
    ``params['level']`` and the three largest totals by ``value`` are kept,
    once over all years ("historic") and once for the most recent year found
    for that industry code ("last period").  Values whose summed ``count`` is
    below ``LIMIT_C`` are replaced by the marker 'C'.

    The result is written to ``'<file_name>.json'`` under the keys
    ``top3_industry_ent_historic`` and ``top3_industry_ent_last_period``,
    each holding one list of records per industry key (``sector``,
    ``subsector`` or ``industry_group``).

    Args:
        prev: Excel source handed to ``pd.read_excel``.
        params: Mapping read through ``.get`` for 'sheets' (iterable of sheet
            names), 'level' ('ent_id', otherwise the country branch is taken)
            and 'file_name' (JSON output path without extension).

    Returns:
        The cleaned DataFrame of the last sheet processed.
    """
    level = params.get('level')
    industry_keys = ['sector_id', 'subsector_id', 'industry_group_id']

    def _top_three(frame):
        """Keep the 3 largest rows by value, rank them and mask small counts."""
        top = frame.sort_values(by=['value'], ascending=False).head(3).copy()
        top['top'] = range(1, top.shape[0] + 1)
        # Cast before masking: writing a string into a numeric column is
        # deprecated since pandas 2.1.
        top['value'] = top['value'].astype(object)
        top.loc[top['count'] < LIMIT_C, 'value'] = 'C'
        return top

    def _stack(frames, columns):
        """Concatenate the pieces, or build an empty frame when there are none."""
        if frames:
            # DataFrame.append was removed in pandas 2.0.
            return pd.concat(frames, sort=False)
        return pd.DataFrame(columns=columns)

    historic = {}
    last_period = {}
    for sheet in params.get('sheets'):
        df = pd.read_excel(prev, sheet_name=sheet)
        df.columns = df.columns.str.strip()
        df = df.rename(columns=FDI_COLUMNS)
        # The first industry column present in the sheet is the ranking key.
        pk_id = [x for x in df.columns if x in industry_keys][0]
        df = df.dropna(subset=[level])

        # Translate names into ids.  Plain assignment replaces the chained
        # ``replace(..., inplace=True)`` calls, which do not update the parent
        # frame when pandas Copy-on-Write is active.
        dim_geo, dim_country = get_dimensions()
        if level == 'ent_id':
            df['ent_id'] = df['ent_id'].replace(
                dict(zip(dim_geo['ent_name'], dim_geo['ent_id'])))
        else:
            country = df['country_id'].replace(COUNTRY_REPLACE)
            country = country.replace(
                dict(zip(dim_country['country_name_es'],
                         dim_country['iso3'])))
            df['country_id'] = country.replace(missing_replacements)
            # Drop the 'xxa' code (labelled "Otros países", i.e. other
            # countries, in the original comment).
            df = df.loc[df['country_id'] != 'xxa'].copy()

        # Keep only the text before the first space and read it as an integer
        # code.
        df[pk_id] = df[pk_id].str.split(' ', n=1, expand=True)[0].astype(int)
        if pk_id == 'sector_id':
            df['sector_id'] = df['sector_id'].replace(
                SECTOR_REPLACE).astype(str)

        historic_parts = []
        last_parts = []
        for ele in df[pk_id].unique():
            rows = df.loc[df[pk_id] == ele]

            # Top 3 by accumulated FDI over every year in the sheet.
            totals = rows[[level, pk_id, 'value', 'count', 'value_c']] \
                .groupby(by=[level, pk_id]).sum().reset_index()
            historic_parts.append(_top_three(totals))

            # Top 3 by FDI in the latest year available for this code.
            yearly = rows[['year', level, pk_id, 'value', 'count']] \
                .groupby(by=['year', level, pk_id]).sum().reset_index()
            latest = yearly.loc[yearly['year'] == yearly['year'].max()]
            last_parts.append(_top_three(latest))

        top_3_historic = _stack(
            historic_parts, [level, pk_id, 'value', 'count', 'value_c', 'top'])
        top_3_last_period = _stack(
            last_parts, ['year', level, pk_id, 'value', 'count', 'top'])

        # Attach human-readable names next to the ids.
        if level == 'ent_id':
            ent_names = dict(zip(dim_geo['ent_id'], dim_geo['ent_name']))
            label_cols = ['ent_id', 'ent_name']
            for frame in (top_3_historic, top_3_last_period):
                frame['ent_name'] = frame['ent_id'].replace(ent_names)
        else:
            names_es = dict(
                zip(dim_country['iso3'], dim_country['country_name_es']))
            names_en = dict(
                zip(dim_country['iso3'], dim_country['country_name']))
            label_cols = ['country_id', 'country_name', 'country_name_en']
            for frame in (top_3_historic, top_3_last_period):
                frame['country_name'] = frame['country_id'].replace(names_es)
                frame['country_name_en'] = frame['country_id'].replace(
                    names_en)

        top_3_historic = top_3_historic[
            label_cols + [pk_id, 'value', 'count', 'top']].copy()
        top_3_last_period = top_3_last_period[
            label_cols + [pk_id, 'value', 'count', 'year']].copy()

        industry = pk_id.split('_id')[0]
        historic[industry] = top_3_historic.to_dict(orient='records')
        last_period[industry] = top_3_last_period.to_dict(orient='records')

    result = {
        'top3_industry_ent_historic': historic,
        'top3_industry_ent_last_period': last_period,
    }
    with open('{}.json'.format(params.get('file_name')), 'w') as outfile:
        json.dump(result, outfile)
    return df
def run_step(self, prev, params):
    """Compute, per country, the top-3 rows of ``params['level']`` by FDI.

    The sheet ``params['sheets']`` is read and cleaned, country names are
    translated to codes, and for each distinct country the rows are summed
    per level value.  The three largest totals by ``value`` are kept, once
    over all years ("historic") and once for the most recent year found for
    that country ("last period").  Values whose summed ``count`` is below
    ``LIMIT_C`` are replaced by the marker 'C'.

    The result is written to ``'<file_name>.json'`` and also returned.

    Args:
        prev: Excel source handed to ``pd.read_excel``.
        params: Mapping read through ``.get`` for 'sheets' (sheet to read),
            'level' ('ent_id' or 'investment_type'; any other value raises
            ``KeyError`` when the output columns are selected) and
            'file_name' (JSON path without extension, also used as the
            prefix of the result keys).

    Returns:
        dict with the keys ``'<file_name>_historic'`` and
        ``'<file_name>_last_period'``, each mapping the level name (without a
        trailing ``_id``) to a list of records.
    """
    pk_id = params.get('level')
    file_name = params.get('file_name')

    def _top_three(frame):
        """Keep the 3 largest rows by value, rank them and mask small counts."""
        top = frame.sort_values(by=['value'], ascending=False).head(3).copy()
        top['top'] = range(1, top.shape[0] + 1)
        # Cast before masking: writing a string into a numeric column is
        # deprecated since pandas 2.1.
        top['value'] = top['value'].astype(object)
        top.loc[top['count'] < LIMIT_C, 'value'] = 'C'
        return top

    def _stack(frames, columns):
        """Concatenate the pieces, or build an empty frame when there are none."""
        if frames:
            # DataFrame.append was removed in pandas 2.0.
            return pd.concat(frames, sort=False)
        return pd.DataFrame(columns=columns)

    df = pd.read_excel(prev, sheet_name=params.get('sheets'))
    df.columns = df.columns.str.strip()
    df = df.rename(columns=FDI_COLUMNS)
    df = df.dropna(subset=[pk_id])

    # Translate names into ids.  Plain assignment replaces the chained
    # ``replace(..., inplace=True)`` calls, which do not update the parent
    # frame when pandas Copy-on-Write is active.
    dim_geo, dim_country = get_dimensions()
    if pk_id == 'ent_id':
        df = df.loc[~df['ent_id'].str.contains('Total')].copy()
        df['ent_id'] = df['ent_id'].replace(
            dict(zip(dim_geo['ent_name'], dim_geo['ent_id']))).astype(int)

    country = df['country_id'].replace(COUNTRY_REPLACE)
    country = country.replace(
        dict(zip(dim_country['country_name_es'], dim_country['iso3'])))
    df['country_id'] = country.replace(missing_replacements)
    # Drop the 'xxa' code (labelled "Otros países", i.e. other countries, in
    # the original comment).
    df = df.loc[df['country_id'] != 'xxa'].copy()
    # Names still left untranslated after the lookups above.
    df['country_id'] = df['country_id'].replace({
        'Estados Unidos': 'usa',
        'Reino Unido': 'gbr',
        'Chequia': 'cze',
    })

    # For each country, select the top 3 level values that receive FDI.
    historic_parts = []
    last_parts = []
    for ele in df['country_id'].unique():
        rows = df.loc[df['country_id'] == ele]

        # Top 3 by accumulated FDI over every year in the sheet.
        totals = rows[[pk_id, 'country_id', 'value', 'count', 'value_c']] \
            .groupby(by=[pk_id, 'country_id']).sum().reset_index()
        historic_parts.append(_top_three(totals))

        # Top 3 by FDI in the latest year available for this country.
        yearly = rows[['year', pk_id, 'country_id', 'value', 'count',
                       'value_c']] \
            .groupby(by=['year', pk_id, 'country_id']).sum().reset_index()
        latest = yearly.loc[yearly['year'] == yearly['year'].max()]
        last_parts.append(_top_three(latest))

    top_3_historic = _stack(
        historic_parts,
        [pk_id, 'country_id', 'value', 'count', 'value_c', 'top'])
    top_3_last_period = _stack(
        last_parts,
        ['year', pk_id, 'country_id', 'value', 'count', 'value_c', 'top'])

    # Attach human-readable names next to the ids.
    names_es = dict(zip(dim_country['iso3'], dim_country['country_name_es']))
    names_en = dict(zip(dim_country['iso3'], dim_country['country_name']))
    ent_names = dict(zip(dim_geo['ent_id'], dim_geo['ent_name']))
    for frame in (top_3_historic, top_3_last_period):
        if pk_id == 'ent_id':
            frame['ent_name'] = frame['ent_id'].replace(ent_names)
        frame['country_name'] = frame['country_id'].replace(names_es)
        frame['country_name_en'] = frame['country_id'].replace(names_en)

    level_keys = {
        'ent_id': [pk_id, 'ent_name'],
        'investment_type': [pk_id]
    }
    leading = ['country_id', 'country_name', 'country_name_en'] \
        + level_keys[pk_id]
    top_3_historic = top_3_historic[
        leading + ['value', 'count', 'top']].copy().reset_index(drop=True)
    top_3_last_period = top_3_last_period[
        leading + ['value', 'count', 'year']].copy().reset_index(drop=True)
    top_3_last_period['year'] = top_3_last_period['year'].astype(int)

    level_name = pk_id.split('_id')[0]
    result = {
        '{}_historic'.format(file_name): {
            level_name: top_3_historic.to_dict(orient='records')
        },
        '{}_last_period'.format(file_name): {
            level_name: top_3_last_period.to_dict(orient='records')
        },
    }
    with open('{}.json'.format(file_name), 'w') as outfile:
        json.dump(result, outfile)
    return result