Beispiel #1
0
    def run_step(self, prev, params):
        """Reshape sheet '2.4' of the workbook into one row per investment type.

        The sheet is read, its headers are normalised, rows whose entity
        name contains 'Total' are discarded, entity names are mapped to ids
        and the wide count/value columns are stacked into a long table with
        an ``investment_type`` column.

        Args:
            prev: Passed unchanged to ``pd.read_excel`` as the workbook to
                read.
            params: Not used by this step.

        Returns:
            pandas.DataFrame: the frames returned by ``validate_category``
            (one call per ``ent_id``) concatenated, without the rows whose
            ``value_c`` equals ``'c'`` (case-insensitive) and with
            ``value_c`` cast to float.
        """
        df = pd.read_excel(prev, sheet_name='2.4')
        df.columns = [
            norm(x.strip().lower().replace(' ', '_').replace('-', '_').replace(
                '%', 'perc')) for x in df.columns
        ]
        df = df.loc[~df['entidad_federativa'].str.contains('Total')].copy()

        # Map entity names to ids using the first dimension table.
        dim_geo = get_dimensions()[0]
        ent_name_to_id = dict(zip(dim_geo['ent_name'], dim_geo['ent_id']))
        # Assign the result back instead of calling replace(inplace=True) on
        # the column selection: the chained in-place form does not update
        # the parent frame under pandas Copy-on-Write.
        df['entidad_federativa'] = df['entidad_federativa'].replace(
            ent_name_to_id)

        # Positional rename: pandas raises if the sheet does not have
        # exactly these twelve columns.
        df.columns = [
            'ent_id', 'year', 'quarter_id', 'value_between_companies',
            'value_new_investments', 'value_re_investments',
            'count_between_companies', 'count_new_investments',
            'count_re_investments', 'value_between_companies_c',
            'value_new_investments_c', 'value_re_investments_c'
        ]

        # quarter_id becomes the integer <year><quarter>, e.g. 2020 and 3
        # give 20203.
        df['quarter_id'] = df['year'].astype(int).astype(
            str) + df['quarter_id'].astype(int).astype(str)
        df['quarter_id'] = df['quarter_id'].astype(int)

        # Only the "_c" value columns are used from here on.
        df.drop(columns=[
            'value_between_companies', 'value_new_investments',
            'value_re_investments'
        ],
                inplace=True)

        # Wide -> long: one slice per investment type, stacked vertically.
        base = ['ent_id', 'year', 'quarter_id']
        stacked = []
        for option in ('between_companies', 'new_investments',
                       're_investments'):
            part = df[base + [
                'count_{}'.format(option), 'value_{}_c'.format(option)
            ]].copy()
            part.columns = base + ['count', 'value_c']
            part['investment_type'] = option
            stacked.append(part.dropna(subset=['value_c']))
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported equivalent and avoids re-copying on every iteration.
        df = pd.concat(stacked)

        df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

        # validate_category is applied separately to each entity's rows.
        validated = [
            validate_category(df.loc[df['ent_id'] == ent], 'investment_type',
                              'value_c', 'c') for ent in df['ent_id'].unique()
        ]
        # pd.concat rejects an empty list, so fall back to an empty frame
        # (what the former append-based loop produced with no entities).
        df = pd.concat(validated) if validated else pd.DataFrame()

        # Drop rows still carrying the 'c' marker, then make values numeric.
        df = df.loc[df['value_c'].astype(str).str.lower() != 'c'].copy()
        df['value_c'] = df['value_c'].astype(float)

        return df
Beispiel #2
0
    def run_step(self, prev, params):
        """Build top-3 rankings per industry level and dump them to JSON.

        For every sheet in ``params['sheets']`` the sheet is read, renamed
        through ``FDI_COLUMNS`` and keyed by whichever of ``sector_id``,
        ``subsector_id`` or ``industry_group_id`` it contains. For each
        value of that key, the three ``params['level']`` members with the
        largest summed ``value`` are kept, once over all years and once for
        the latest year. Rows whose ``count`` is below ``LIMIT_C`` get their
        ``value`` replaced by the marker ``'C'``.

        Args:
            prev: Passed unchanged to ``pd.read_excel`` as the workbook to
                read.
            params: Mapping providing ``sheets`` (iterable of sheet names),
                ``level`` (``'ent_id'`` selects the entity branch, any other
                value the country branch) and ``file_name`` (output file
                stem).

        Returns:
            pandas.DataFrame: the cleaned frame of the last processed sheet,
            or an empty frame when ``sheets`` is empty.

        Side effects:
            Writes ``'<file_name>.json'`` with the keys
            ``top3_industry_ent_historic`` and
            ``top3_industry_ent_last_period``.
        """
        data = prev
        level = params.get('level')

        result = {}
        historic = {}
        last_period = {}

        # The dimension tables and lookup dicts do not depend on the sheet,
        # so they are built once instead of once per sheet.
        dim_geo, dim_country = get_dimensions()
        if level == 'ent_id':
            ent_name_to_id = dict(zip(dim_geo['ent_name'], dim_geo['ent_id']))
            ent_id_to_name = dict(zip(dim_geo['ent_id'], dim_geo['ent_name']))
        else:
            name_es_to_iso3 = dict(
                zip(dim_country['country_name_es'], dim_country['iso3']))
            iso3_to_name_es = dict(
                zip(dim_country['iso3'], dim_country['country_name_es']))
            iso3_to_name_en = dict(
                zip(dim_country['iso3'], dim_country['country_name']))

        def _finish_top3(ranked):
            """Keep the first three rows, number them and mask low counts."""
            top = ranked[:3].copy()
            top['top'] = range(1, top.shape[0] + 1)
            # Cast to object first so the 'C' marker can be stored without
            # relying on an implicit float -> object upcast, which newer
            # pandas versions deprecate.
            top['value'] = top['value'].astype(object)
            top.loc[top['count'] < LIMIT_C, 'value'] = 'C'
            return top

        def _stack(frames):
            """Concatenate frames; pd.concat rejects an empty list."""
            return pd.concat(frames, sort=False) if frames else pd.DataFrame()

        # Defined up front so the final return also works with no sheets
        # (previously an UnboundLocalError).
        df = pd.DataFrame()

        for sheet in params.get('sheets'):
            df = pd.read_excel(data, sheet_name=sheet)
            df.columns = df.columns.str.strip()
            df.rename(columns=FDI_COLUMNS, inplace=True)

            # The industry key present in this sheet; IndexError if none is.
            pk_id = [
                x for x in df.columns
                if x in ['sector_id', 'subsector_id', 'industry_group_id']
            ][0]

            df.dropna(subset=[level], inplace=True)

            # Column replacements are assigned back rather than done with
            # replace(inplace=True) on a column selection, which does not
            # update the parent frame under pandas Copy-on-Write.
            if level == 'ent_id':
                df['ent_id'] = df['ent_id'].replace(ent_name_to_id)
            else:
                df['country_id'] = df['country_id'].replace(
                    COUNTRY_REPLACE).replace(name_es_to_iso3).replace(
                        missing_replacements)
                # filter "Otros países" (other countries)
                df = df.loc[df['country_id'] != 'xxa'].copy()

            # The key column holds "<code> <label>"; keep the integer code.
            df[pk_id] = df[pk_id].str.split(' ', n=1,
                                            expand=True)[0].astype(int)

            if pk_id == 'sector_id':
                df['sector_id'] = df['sector_id'].replace(
                    SECTOR_REPLACE).astype(str)

            historic_frames = []
            last_period_frames = []

            for ele in df[pk_id].unique():
                selected = df[pk_id] == ele

                # Top 3 by accumulated value over all years in the sheet.
                totals = df.loc[
                    selected,
                    [level, pk_id, 'value', 'count', 'value_c']].groupby(
                        by=[level, pk_id]).sum().reset_index()
                historic_frames.append(
                    _finish_top3(
                        totals.sort_values(by=['value'], ascending=False)))

                # Top 3 by value within the latest year available.
                yearly = df.loc[
                    selected,
                    ['year', level, pk_id, 'value', 'count']].groupby(
                        by=['year', level, pk_id]).sum().reset_index()
                latest = yearly.loc[yearly['year'] == yearly['year'].max()]
                last_period_frames.append(
                    _finish_top3(
                        latest.sort_values(by=['value'], ascending=False)))

            # DataFrame.append was removed in pandas 2.0; concatenate once.
            top_3_historic = _stack(historic_frames)
            top_3_last_period = _stack(last_period_frames)

            # Attach human-readable names and fix the output column order.
            if level == 'ent_id':
                for frame in (top_3_historic, top_3_last_period):
                    frame['ent_name'] = frame['ent_id'].replace(
                        ent_id_to_name)
                label_cols = ['ent_id', 'ent_name']
            else:
                for frame in (top_3_historic, top_3_last_period):
                    frame['country_name'] = frame['country_id'].replace(
                        iso3_to_name_es)
                    frame['country_name_en'] = frame['country_id'].replace(
                        iso3_to_name_en)
                label_cols = ['country_id', 'country_name', 'country_name_en']

            top_3_historic = top_3_historic[
                label_cols + [pk_id, 'value', 'count', 'top']].copy()
            # The last-period output carries 'year' instead of 'top'.
            top_3_last_period = top_3_last_period[
                label_cols + [pk_id, 'value', 'count', 'year']].copy()

            # Keyed by the level name without its '_id' suffix.
            key = pk_id.split('_id')[0]
            historic[key] = top_3_historic.to_dict(orient='records')
            last_period[key] = top_3_last_period.to_dict(orient='records')

        result['top3_industry_ent_historic'] = historic
        result['top3_industry_ent_last_period'] = last_period

        with open('{}.json'.format(params.get('file_name')), 'w') as outfile:
            json.dump(result, outfile)

        return df
Beispiel #3
0
    def run_step(self, prev, params):
        """Build, per country, the top-3 members of a level and dump to JSON.

        The sheet named by ``params['sheets']`` is read and renamed through
        ``FDI_COLUMNS``. For every country, the three ``params['level']``
        members with the largest summed ``value`` are kept, once over all
        years and once for the latest year. Rows whose ``count`` is below
        ``LIMIT_C`` get their ``value`` replaced by the marker ``'C'``.

        Args:
            prev: Passed unchanged to ``pd.read_excel`` as the workbook to
                read.
            params: Mapping providing ``sheets`` (forwarded as
                ``sheet_name``), ``level`` (``'ent_id'`` or
                ``'investment_type'``) and ``file_name`` (output file stem
                and prefix of the result keys).

        Returns:
            dict: ``{'<file_name>_historic': {...},
            '<file_name>_last_period': {...}}`` where each inner dict maps
            the level name (without ``'_id'``) to a list of row records.

        Raises:
            KeyError: if ``level`` is not one of the supported levels.

        Side effects:
            Writes the returned dict to ``'<file_name>.json'``.
        """
        data = prev
        pk_id = params.get('level')

        # Extra output columns per supported level. Looked up first so an
        # unsupported level fails before any file is read.
        level_keys = {
            'ent_id': [pk_id, 'ent_name'],
            'investment_type': [pk_id]
        }
        extra_cols = level_keys[pk_id]

        result = {}
        historic = {}
        last_period = {}

        df = pd.read_excel(data, sheet_name=params.get('sheets'))
        df.columns = df.columns.str.strip()
        df.rename(columns=FDI_COLUMNS, inplace=True)

        df.dropna(subset=[pk_id], inplace=True)

        dim_geo, dim_country = get_dimensions()

        # Column replacements are assigned back rather than done with
        # replace(inplace=True) on a column selection, which does not update
        # the parent frame under pandas Copy-on-Write.
        if pk_id == 'ent_id':
            df = df.loc[~df['ent_id'].str.contains('Total')].copy()
            df['ent_id'] = df['ent_id'].replace(
                dict(zip(dim_geo['ent_name'],
                         dim_geo['ent_id']))).astype(int)

        df['country_id'] = df['country_id'].replace(COUNTRY_REPLACE).replace(
            dict(zip(dim_country['country_name_es'],
                     dim_country['iso3']))).replace(missing_replacements)
        # filter "Otros países" (other countries)
        df = df.loc[df['country_id'] != 'xxa'].copy()

        # Names still unmapped after the replacements above.
        df.loc[df['country_id'] == 'Estados Unidos', 'country_id'] = 'usa'
        df.loc[df['country_id'] == 'Reino Unido', 'country_id'] = 'gbr'
        df.loc[df['country_id'] == 'Chequia', 'country_id'] = 'cze'

        def _finish_top3(ranked):
            """Keep the first three rows, number them and mask low counts."""
            top = ranked[:3].copy()
            top['top'] = range(1, top.shape[0] + 1)
            # Cast to object first so the 'C' marker can be stored without
            # relying on an implicit float -> object upcast, which newer
            # pandas versions deprecate.
            top['value'] = top['value'].astype(object)
            top.loc[top['count'] < LIMIT_C, 'value'] = 'C'
            return top

        # For each country, select the top 3 level members by value.
        historic_frames = []
        last_period_frames = []

        for ele in df['country_id'].unique():
            selected = df['country_id'] == ele

            # Top 3 by accumulated value over all years in the sheet.
            totals = df.loc[
                selected,
                [pk_id, 'country_id', 'value', 'count', 'value_c']].groupby(
                    by=[pk_id, 'country_id']).sum().reset_index()
            historic_frames.append(
                _finish_top3(totals.sort_values(by=['value'],
                                                ascending=False)))

            # Top 3 by value within the latest year available.
            yearly = df.loc[selected, [
                'year', pk_id, 'country_id', 'value', 'count', 'value_c'
            ]].groupby(by=['year', pk_id, 'country_id']).sum().reset_index()
            latest = yearly.loc[yearly['year'] == yearly['year'].max()]
            last_period_frames.append(
                _finish_top3(latest.sort_values(by=['value'],
                                                ascending=False)))

        # DataFrame.append was removed in pandas 2.0; concatenate once.
        # pd.concat rejects an empty list, hence the empty-frame fallback.
        top_3_historic = (pd.concat(historic_frames, sort=False)
                          if historic_frames else pd.DataFrame())
        top_3_last_period = (pd.concat(last_period_frames, sort=False)
                             if last_period_frames else pd.DataFrame())

        # Attach human-readable names.
        ent_id_to_name = None
        if pk_id == 'ent_id':
            ent_id_to_name = dict(zip(dim_geo['ent_id'], dim_geo['ent_name']))
        iso3_to_name_es = dict(
            zip(dim_country['iso3'], dim_country['country_name_es']))
        iso3_to_name_en = dict(
            zip(dim_country['iso3'], dim_country['country_name']))

        for frame in (top_3_historic, top_3_last_period):
            if ent_id_to_name is not None:
                frame['ent_name'] = frame['ent_id'].replace(ent_id_to_name)
            frame['country_name'] = frame['country_id'].replace(
                iso3_to_name_es)
            frame['country_name_en'] = frame['country_id'].replace(
                iso3_to_name_en)

        country_cols = ['country_id', 'country_name', 'country_name_en']
        top_3_historic = top_3_historic[
            country_cols + extra_cols +
            ['value', 'count', 'top']].copy().reset_index(drop=True)
        # The last-period output carries 'year' instead of 'top'.
        top_3_last_period = top_3_last_period[
            country_cols + extra_cols +
            ['value', 'count', 'year']].copy().reset_index(drop=True)

        top_3_last_period['year'] = top_3_last_period['year'].astype(int)

        # Keyed by the level name without its '_id' suffix.
        key = pk_id.split('_id')[0]
        historic[key] = top_3_historic.to_dict(orient='records')
        last_period[key] = top_3_last_period.to_dict(orient='records')

        result['{}_historic'.format(params.get('file_name'))] = historic
        result['{}_last_period'.format(params.get('file_name'))] = last_period

        with open('{}.json'.format(params.get('file_name')), 'w') as outfile:
            json.dump(result, outfile)

        return result