예제 #1
0
    def run_step(self, prev, params):
        """Build the municipality-by-sector table (level 4) from the base frame.

        Args:
            prev: 4-tuple ``(df_ent_sec, df_ent_sub, df_ent_ram, base)``. The
                first three items are returned untouched; ``base`` is the raw
                table filtered here (it is copied, never modified).
            params: Unused.

        Returns:
            Tuple ``(df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec)``.
        """
        logger.info('Municipality - Sector Step...')
        df_ent_sec, df_ent_sub, df_ent_ram, base = prev

        df = base.copy()

        # Always compare the stripped code. Previously '31-33' was matched on
        # the raw value while '48-49' (and sector_id below) used the stripped
        # one, so a padded '31-33' row was silently dropped.
        code = df['CODIGO'].str.strip()
        is_sector = (code.str.len() == 2) | code.isin(['31-33', '48-49'])
        # Keep rows with no stratum but with a municipality.
        is_municipal = df['ID_ESTRATO'].isna() & df['MUNICIPIO'].notna()
        df = df.loc[is_sector & is_municipal & df['CODIGO'].notna()].copy()

        df['sector_id'] = df['CODIGO'].astype(str).str.strip()
        # mun_id = state code followed by the 3-digit, zero-padded municipality
        df['mun_id'] = (df['ENTIDAD'].astype(str) +
                        df['MUNICIPIO'].astype(str).str.zfill(3)).astype(int)

        df.drop(columns=['ID_ESTRATO', 'CODIGO', 'MUNICIPIO', 'ENTIDAD'],
                inplace=True)

        df = fill_level(df, FILL_COLUMNS)

        # Municipality-Sector
        df['level'] = 4

        df_mun_sec = df.copy()

        return df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec
    def run_step(self, prev, params):
        """Reshape the eight chart frames to one layout and dump them to CSV.

        Charts 1-4 are tagged "INEI" and take their year from the last four
        characters of the ``censo`` column; charts 5-8 are tagged "ENE" with
        the year fixed to 2017. The frames in ``prev`` are modified/replaced
        in place.

        Args:
            prev: Mapping ``{1..8: DataFrame}`` of the raw charts.
            params: Unused.

        Returns:
            Always ``0``; the result is written to ``data_temp/tidy_file.csv``.
        """
        logger.info("Tidying up DataFrame...")

        frames = prev
        census_charts = (1, 2, 3, 4)
        output_columns = [
            "region", "data_origin", "year", "variable", "response",
            "percentage"
        ]

        for chart_no in range(1, 9):
            is_census = chart_no in census_charts
            frame = frames[chart_no]

            frame["region"] = frame["region"].str.title()
            frame["data_origin"] = "INEI" if is_census else "ENE"

            if is_census:
                frame = frame.rename(columns={"censo": "year"})
                frame["year"] = frame["year"].str[-4:].astype(int)
                response_col = 2
            else:
                frame["year"] = 2017
                response_col = 1

            frame["variable"] = VARIABLE_DICT[chart_no]
            # The response labels sit in a positional column of the raw chart.
            frame["response"] = frame.iloc[:, response_col]

            frame = frame.rename(columns={"valor_porcentaje": "percentage"})
            frames[chart_no] = frame[output_columns]

        tidy = pd.concat([frames[k] for k in range(1, 9)], ignore_index=True)
        tidy.to_csv("data_temp/tidy_file.csv",
                    index=False,
                    quoting=csv.QUOTE_NONNUMERIC)

        return 0
예제 #3
0
    def run_step(self, prev, params):
        """Build the state-by-subsector table (level 2) from the base frame.

        Args:
            prev: 2-tuple ``(df_ent_sec, base)``; ``base`` is copied before
                filtering and passed on unchanged.
            params: Unused.

        Returns:
            Tuple ``(df_ent_sec, df_ent_sub, base)``.
        """
        logger.info('State - Subsector Step...')
        df_ent_sec, base = prev

        table = base.copy()

        # Subsector rows: 3-character code, no stratum, no municipality.
        three_char_code = table['CODIGO'].str.strip().str.len() == 3
        state_level = table['ID_ESTRATO'].isna() & table['MUNICIPIO'].isna()
        has_code = table['CODIGO'].notna()
        table = table.loc[three_char_code & state_level & has_code].copy()

        table['subsector_id'] = table['CODIGO'].astype(int)
        table['ent_id'] = table['ENTIDAD'].astype(int)

        table = table.drop(
            columns=['ID_ESTRATO', 'CODIGO', 'MUNICIPIO', 'ENTIDAD'])

        table = fill_level(table, FILL_COLUMNS)

        # State-Subsector
        table['level'] = 2

        return df_ent_sec, table.copy(), base
    def run_step(self, prev, params):
        """Turn the raw GDP CSV into ``year / country_id / gdp_value`` rows.

        Args:
            prev: Path or buffer accepted by ``pd.read_csv`` holding the GDP
                data; its first 2306 rows are skipped.
            params: Unused.

        Returns:
            DataFrame with columns ``year``, ``country_id``, ``gdp_value``.
            Country codes missing from ``data_temp/dim_country.csv`` map to
            NaN.
        """
        logger.info("Applying Transformations...")

        gdp = pd.read_csv(prev)[2306:]

        # Country dimension: code -> surrogate id
        countries = pd.read_csv("data_temp/dim_country.csv")
        code_to_id = dict(
            zip(countries["country_code"], countries["country_id"]))
        gdp["Country Code"] = gdp["Country Code"].map(code_to_id)

        # Rename to the output schema and keep only those columns
        tidy = gdp.rename(columns={
            "Year": "year",
            "Country Code": "country_id",
            "Value": "gdp_value"
        })

        return tidy[["year", "country_id", "gdp_value"]]
예제 #5
0
    def run_step(self, prev, params):
        """Read every data file in ``prev`` and stack them into one frame.

        Each element of ``prev`` is indexed as ``(path, info)`` where
        ``info['file']`` is the file name. The file whose name contains
        'diccionario' supplies the column labels; names containing
        'ce2019_nac' or 'Indicators' are skipped; everything else is data.

        Args:
            prev: Iterable of ``(path, {'file': name, ...})`` entries.
            params: Unused.

        Returns:
            Concatenation of all data files with a constant ``year`` column
            (2019). An empty frame (with only ``year``) when there is no data
            file.
        """
        logger.info('Read Step...')

        label = ''
        data = []
        for ele in prev:
            name = ele[1]['file']
            if 'diccionario' in name:
                label = ele[0]
            elif 'ce2019_nac' in name or 'Indicators' in name:
                continue
            else:
                data.append(ele[0])

        frames = []
        if data:
            # The labels file is the same for every data file: read it once.
            labels = pd.read_csv(label)
            columns = list(labels.reset_index()['index'])

            for path in data:
                df = pd.read_csv(path,
                                 engine='python',
                                 sep="[\t]*,[\t]*",
                                 header=0)
                df = df.reset_index()

                df.drop(columns=['ENTIDAD'], inplace=True)
                df.columns = columns
                df.replace(' ', np.nan, inplace=True)
                frames.append(df)

        # DataFrame.append was removed in pandas 2.0; concatenate once instead
        # of growing a frame inside the loop.
        df = pd.concat(frames) if frames else pd.DataFrame()

        df['year'] = 2019

        return df
예제 #6
0
    def run_step(self, prev, params):
        """Replace the tidy file's labels with dimension ids.

        Reads ``data_temp/tidy_file.csv`` plus the region and variable
        dimension CSVs from ``data_output`` and maps region names,
        data origins and ``variable|response`` pairs to their ids.

        Args:
            prev: Unused.
            params: Unused.

        Returns:
            DataFrame with columns ``region_id``, ``data_origin_id``,
            ``response_id``, ``year``, ``percentage``.
        """
        logger.info("Creating Fact Table...")

        fact = pd.read_csv("data_temp/tidy_file.csv")

        regions = pd.read_csv("data_output/tic_dim_region.csv")
        region_ids = dict(zip(regions["region_name"], regions["region_id"]))
        fact["region_id"] = fact["region"].map(region_ids)

        fact["data_origin_id"] = fact["data_origin"].map({
            "INEI": 1,
            "ENE": 0
        })

        variables = pd.read_csv("data_output/tic_dim_variable.csv")
        response_ids = dict(
            zip(variables["combined"], variables["response_id"]))
        # Same "variable|response" key the variable dimension is built on.
        fact["combined"] = fact["variable"] + "|" + fact["response"]
        fact["response_id"] = fact["combined"].map(response_ids)

        return fact[[
            "region_id", "data_origin_id", "response_id", "year", "percentage"
        ]]
예제 #7
0
    def run_step(self, prev, params):
        """Load ``data_source/chart1.xlsx`` .. ``chart8.xlsx``.

        Args:
            prev: Unused.
            params: Unused.

        Returns:
            Dict mapping the chart number (1-8) to its DataFrame.
        """
        logger.info("Opening files from source folder...")

        charts = {}
        for chart_no in range(1, 9):
            charts[chart_no] = pd.read_excel(
                "data_source/chart{}.xlsx".format(chart_no))

        return charts
    def run_step(self, prev, params):
        """Build the region dimension from the tidy file.

        Args:
            prev: Unused.
            params: Unused.

        Returns:
            DataFrame with ``region_id`` (0..n-1) and ``region_name`` (the
            distinct regions of ``data_temp/tidy_file.csv``, sorted).
        """
        logger.info("Creating Region Dimension...")

        tidy = pd.read_csv("data_temp/tidy_file.csv")

        region_names = sorted(tidy["region"].unique())
        region_ids = list(range(len(region_names)))

        return pd.DataFrame({
            "region_id": region_ids,
            "region_name": region_names
        })
    def run_step(self, prev, params):
        """Clean one unemployment-claims frame and append it to the output CSV.

        Renames the source columns, adds a ``fips_code`` built from the state
        name, parses dates and numbers, sorts by ``reflecting_week_end``
        (newest first) and appends the rows to
        ``./unemployment_output/partial_output.csv``. The CSV header is only
        written when ``params["filename"]`` is ``"A-M.xml"``.

        Args:
            prev: Raw claims DataFrame.
            params: Mapping with a ``filename`` entry.

        Returns:
            The transformed DataFrame.
        """
        filename = params.get("filename")
        logger.info("Transforming {}".format(filename))

        claims = prev.rename(
            columns={
                "stateName": "state_name",
                "weekEnded": "week_ended",
                "InitialClaims": "initial_claims",
                "ReflectingWeekEnded": "reflecting_week_end",
                "ContinuedClaims": "continued_claims",
                "CoveredEmployment": "covered_employment",
                "InsuredUnemploymentRate": "insured_unemployment_rate"
            })

        # State name -> FIPS code, prefixed with "04000US"
        name_to_fips = us.states.mapping("name", "fips")
        claims["fips_code"] = "04000US" + claims["state_name"].map(
            name_to_fips)

        # Column order and types
        claims = claims[[
            "week_ended", "reflecting_week_end", "fips_code", "state_name",
            "initial_claims", "continued_claims", "covered_employment",
            "insured_unemployment_rate"
        ]]

        for date_col in ("week_ended", "reflecting_week_end"):
            claims[date_col] = pd.to_datetime(claims[date_col],
                                              format="%m/%d/%Y")

        # Counts arrive as strings with thousands separators
        for count_col in ("initial_claims", "continued_claims",
                          "covered_employment"):
            claims[count_col] = claims[count_col].str.replace(
                ",", "").astype("int64")

        claims["insured_unemployment_rate"] = claims[
            "insured_unemployment_rate"].astype("float64")

        # Latest date first
        claims = claims.sort_values(by=["reflecting_week_end"],
                                    ascending=False)

        # Append to the CSV; only the "A-M.xml" file writes the header
        write_header = filename == "A-M.xml"
        claims.to_csv("./unemployment_output/partial_output.csv",
                      header=write_header,
                      index=False,
                      mode="a",
                      quoting=csv.QUOTE_NONNUMERIC)

        return claims
예제 #10
0
    def run_step(self, prev_result, params):
        """Read every sheet of the workbook and melt it to long format.

        For each sheet, only rows whose ``Periodo`` starts with '1' or '2'
        are kept; the remaining columns are melted into ``name`` / ``value``
        pairs and sorted by ``Periodo``.

        Args:
            prev_result: Path or buffer accepted by ``pd.read_excel``.
            params: Unused.

        Returns:
            The melted sheets stacked in sheet order (empty frame if the
            workbook yields no sheets).
        """
        logger.info('Running Read step...')

        sheets = pd.read_excel(prev_result, sheet_name=None, skiprows=2)

        melted = []
        for sheet in sheets.values():
            first_char = sheet.Periodo.str[0]
            sheet = sheet[first_char.isin(['1', '2'])]
            melted.append(
                sheet.melt(id_vars='Periodo',
                           var_name='name').sort_values(by='Periodo'))

        # Concatenate once instead of re-copying the accumulated frame on
        # every iteration.
        return pd.concat(melted) if melted else pd.DataFrame()
예제 #11
0
    def run_step(self, prev_result, params):
        """Build the expense-type dimension from the shared ``global_df``.

        Distinct ``desc_tipogasto`` values are de-duplicated on their
        lower-cased, unidecode-normalized form, numbered from 1, translated
        to English and spell-corrected.

        Args:
            prev_result: Unused.
            params: Unused.

        Returns:
            DataFrame with ``exp_type_id`` (str), ``exp_type_des``,
            ``exp_type_des_en`` and the join key ``desc_tipogasto_mod``.
        """
        logger.info('Running expenses type dimension step...')

        def _normalize(text):
            return unidecode.unidecode(text.lower())

        def _to_english(text):
            return GoogleTranslator(source='auto', target='en').translate(text)

        def _spell_check(text):
            return speller.correction(text)

        def _capitalize(text):
            return text.capitalize()

        global_df = self.get_pipeline_results_ref()["global_df"]

        sizes = global_df[['desc_tipogasto']].groupby(['desc_tipogasto'
                                                       ]).size()
        dim = sizes.to_frame(name="count").reset_index()

        dim['desc_tipogasto_mod'] = dim['desc_tipogasto'].apply(_normalize)
        dim = dim.drop_duplicates(subset='desc_tipogasto_mod', keep='first')
        dim = dim.drop(columns=['count'])

        dim.insert(0, 'id_tipogasto', range(1, 1 + len(dim)))

        # Translate the original text before it is spell-corrected
        dim['exp_type_des_en'] = dim['desc_tipogasto'].apply(
            _to_english).apply(_capitalize)
        dim['desc_tipogasto'] = dim['desc_tipogasto'].apply(
            _spell_check).apply(_capitalize)

        dim = dim.rename(columns={
            'desc_tipogasto': 'exp_type_des',
            'id_tipogasto': 'exp_type_id'
        })

        dim['exp_type_id'] = dim['exp_type_id'].astype(str)

        return dim[[
            'exp_type_id', 'exp_type_des', 'exp_type_des_en',
            'desc_tipogasto_mod'
        ]]
예제 #12
0
    def run_step(self, prev_result, params):
        """Derive month, flow, petroleum flag and ISO3 country from ``name``.

        Adds helper columns to the incoming frame, resolves the country
        label (text after the last ' > ') to an ISO3 code using
        ``dim_shared_country`` plus the extra ``iso3_dict`` entries, drops
        total rows and rows whose value is 'C', and scales values by 1000.

        Args:
            prev_result: Long-format frame with ``Periodo``, ``name`` and
                ``value`` columns (modified in place by the first
                assignments).
            params: Unused.

        Returns:
            DataFrame with ``month_id``, ``iso3``, ``trade_flow_id``,
            ``petroleum``, ``value``.
        """
        logger.info('Running Xform step...')

        def _flag(label, keyword):
            # 1 when the keyword is one of the words of the label, else 2
            return 1 if keyword in label.split() else 2

        trade = prev_result

        trade['month_id'] = trade['Periodo'].str.replace('/', '')
        trade['trade_flow_id'] = trade['name'].apply(
            lambda label: _flag(label, 'Importaciones'))
        trade['petroleum'] = trade['name'].apply(
            lambda label: _flag(label, 'Petroleras'))
        trade['country_name'] = trade['name'].apply(
            lambda label: label.split(' > ')[-1])

        # Country lookup: database dimension plus the extra spellings
        countries = query_to_df(self.connector,
                                'select * from dim_shared_country')
        countries = countries[['country_name_es', 'iso3']]
        extra_countries = pd.DataFrame(list(iso3_dict.items()),
                                       columns=['country_name_es', 'iso3'])
        countries = pd.concat([countries, extra_countries])

        for fragment in strings_to_remove:
            trade['country_name'] = trade['country_name'].str.replace(
                fragment, '', n=1)

        has_lower_total = trade['country_name'].str.contains('total')
        has_upper_total = trade['country_name'].str.contains('Total')
        trade = trade[(~has_lower_total) & ~has_upper_total]

        lookup = countries[['iso3', 'country_name_es'
                            ]].drop_duplicates(subset=['country_name_es'])
        trade = trade.merge(lookup,
                            left_on='country_name',
                            right_on='country_name_es',
                            how='left')

        trade = trade[[
            'month_id', 'iso3', 'trade_flow_id', 'petroleum', 'value'
        ]]

        trade = trade[trade['value'] != 'C']

        trade['value'] = trade['value'].astype(float)
        trade['month_id'] = trade['month_id'].astype(int)
        trade['value'] = trade['value'].fillna(0)
        trade['value'] = trade['value'] * 1000

        return trade
예제 #13
0
    def run_step(self, prev, params):
        """Stack the four level tables and normalize their column types.

        Every column except ``sector_id`` is cast to float, ``sector_id`` to
        str, and ``UE`` / ``level`` to int. A constant ``year`` (2019) is
        added and all column names are lower-cased.

        Args:
            prev: 4-tuple ``(df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec)``.
            params: Unused.

        Returns:
            The combined DataFrame (original row indexes are kept).
        """
        logger.info('Join Step...')
        df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec = prev

        # DataFrame.append was removed in pandas 2.0; one concat is also
        # cheaper than appending frame by frame.
        df = pd.concat([df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec],
                       sort=False)

        numeric_columns = [col for col in df.columns if col != 'sector_id']
        df[numeric_columns] = df[numeric_columns].astype(float)
        df['sector_id'] = df['sector_id'].astype(str)
        df[['UE', 'level']] = df[['UE', 'level']].astype(int)

        df['year'] = 2019

        df.columns = df.columns.str.lower()

        return df
예제 #14
0
    def run_step(self, prev, params):
        """Reshape one Wakanda trade file from wide months to long rows.

        Args:
            prev: DataFrame with one column per upper-case month name and
                ``ORIGIN_OR_DESTINATION`` as its last column.
            params: Mapping with ``trade_flow`` ("IMP" or "EXP") and ``year``
                (str or int).

        Returns:
            DataFrame with ``time_id`` (YYYYMM as int), ``country``
            (lower-case ISO3, NaN when the name is not in
            ``resources/country_iso3_codes.csv``), ``trade_flow`` (1 = IMP,
            2 = EXP) and ``total``.

        Raises:
            KeyError: If ``trade_flow`` is not "IMP" or "EXP", or ``year`` is
                missing from ``params``.
        """
        filename = "Wakanda_{}_{}.csv".format(params.get("trade_flow"), params.get("year"))
        logger.info("Transforming {}...".format(filename))
        df = prev

        # 1. Move the last column (ORIGIN_OR_DESTINATION) to the front
        cols = list(df.columns)
        df = df[[cols[-1]] + cols[:-1]]

        # 2. Melt months
        month_names = [m.upper() for m in calendar.month_name[1:]]
        df = df.melt(id_vars=["ORIGIN_OR_DESTINATION"], value_vars=month_names,
                     var_name="month", value_name="total")

        # 3. Change column names
        df = df.rename(columns={"ORIGIN_OR_DESTINATION": "country"})

        # 4. Drop NaN values
        df = df.dropna()

        # 5. Map country names to ISO3
        country_df = pd.read_csv("resources/country_iso3_codes.csv")
        country_map = dict(zip(country_df["country_name"], country_df["country_iso3"]))
        df["country"] = df["country"].map(country_map).str.lower()

        # 6. Map month names to numeric (JANUARY -> 1 ... DECEMBER -> 12)
        month_map = {name: number for number, name in enumerate(month_names, start=1)}
        df["month"] = df["month"].map(month_map)

        # 7. Create trade flow column
        flow_map = {"IMP": 1, "EXP": 2}
        df["trade_flow"] = flow_map[params["trade_flow"]]

        # 8. Create time_id. The year is coerced to str so an integer year in
        #    params no longer breaks the string concatenation.
        year = str(params["year"])
        df["time_id"] = (year + df["month"].astype(str).str.zfill(2)).astype(int)
        df = df[["time_id", "country", "trade_flow", "total"]]

        # 9. Print the DataFrame
        print(df.head())
        print(df.isnull().any())

        return df
    def run_step(self, prev, params):
        """Build the variable/response dimension from the tidy file.

        Each distinct ``variable|response`` pair of
        ``data_temp/tidy_file.csv`` gets a ``response_id`` in order of first
        appearance (starting at 0).

        Args:
            prev: Unused.
            params: Unused.

        Returns:
            DataFrame with ``response_id``, ``variable_name``,
            ``response_name``, ``combined``.
        """
        logger.info("Creating Variable Dimension...")

        tidy = pd.read_csv("data_temp/tidy_file.csv")

        dim = pd.DataFrame(
            {"combined": tidy["variable"] + "|" + tidy["response"]})
        dim = dim.drop_duplicates().reset_index(drop=True)

        dim["response_id"] = dim.index

        # Split the key back into its two halves
        pieces = dim["combined"].str.split("|")
        dim["variable_name"] = pieces.str[0]
        dim["response_name"] = pieces.str[1]

        return dim[[
            "response_id", "variable_name", "response_name", "combined"
        ]]
    def run_step(self, prev, params):
        """Download the GDP dataset and derive the country dimension from it.

        The first 2306 rows of the download are skipped; the remaining
        distinct (name, code) pairs are numbered from 1. The result is also
        saved to ``data_temp/dim_country.csv``.

        Args:
            prev: Unused.
            params: Unused.

        Returns:
            DataFrame with ``country_id``, ``country_name``, ``country_code``.
        """
        logger.info("Creating Country Dimension...")

        gdp = pd.read_csv("https://datahub.io/core/gdp/r/gdp.csv")[2306:]

        countries = gdp[["Country Name", "Country Code"]]
        countries = countries.drop_duplicates().reset_index(drop=True)
        countries["country_id"] = countries.index + 1

        countries = countries.rename(columns={
            "Country Name": "country_name",
            "Country Code": "country_code"
        })[["country_id", "country_name", "country_code"]]

        countries.to_csv("data_temp/dim_country.csv",
                         index=False,
                         quoting=csv.QUOTE_NONNUMERIC)

        return countries
    def run_step(self, prev, params):
        """Parse one unemployment XML file into a DataFrame of weekly rows.

        Args:
            prev: Unused.
            params: Mapping with a ``filename`` entry, resolved under
                ``unemployment_data/``.

        Returns:
            DataFrame with one row per ``week`` element under
            ``r539cyState``; the columns are the keys of the first week.

        Raises:
            KeyError: If a later week carries a key the first week lacks.
        """
        logger.info("Processing {}".format(params.get("filename")))

        with open("unemployment_data/{}".format(params["filename"]),
                  "r") as file:
            xml = file.read()

        # The JSON round-trip turns the parser's mapping types into plain
        # dicts and lists.
        json_dict = json.loads(json.dumps(xmltodict.parse(xml)))
        weeks = json_dict['r539cyState']['week']

        # xmltodict returns a single mapping rather than a list when the
        # element occurs only once; normalize so one-week files also work.
        if isinstance(weeks, dict):
            weeks = [weeks]

        data_rows = {key: [] for key in weeks[0]}
        for week in weeks:
            for key, value in week.items():
                data_rows[key].append(value)

        return pd.DataFrame(data_rows)
예제 #18
0
    def run_step(self, prev_result, params):
        """Reduce the long trade table to chapter totals keyed by ``hs2_id``.

        Adds ``month_id`` and ``trade_flow_id`` to the incoming frame, drops
        rows whose value is 'C', extracts the section (roman numeral) and
        chapter number from ``name``, keeps only the 'Total capítulo' rows
        and scales the values by 1000.

        Args:
            prev_result: Long-format frame with ``Periodo``, ``name`` and
                ``value`` columns (the first two assignments modify it in
                place).
            params: Unused.

        Returns:
            DataFrame with ``month_id``, ``trade_flow_id``, ``hs2_id``,
            ``value``.
        """
        logger.info('Running Xform step...')

        def _flow_id(label):
            return 1 if 'Importaciones' in label.split() else 2

        def _last_token(text):
            return text.split(' ')[-1]

        def _section_number(text):
            # Section labels end in a roman numeral
            return str(roman.fromRoman(_last_token(text)))

        table = prev_result

        table['month_id'] = table['Periodo'].str.replace('/', '')
        table['trade_flow_id'] = table['name'].apply(_flow_id)

        table = table[table['value'] != 'C']
        table['value'] = table['value'].fillna('0').astype(int)

        table['section'] = table['name'].str.extract(r'(Sección\s\w{1,5})')
        table['chapter'] = table['name'].str.extract(
            r'(Capítulo(|s)\s\d{1,3})')[0]
        # "Capítulos 12" -> "Capítulo 12"
        table['chapter'] = table['chapter'].str.replace('s', '')

        table = table.reset_index(drop=True)

        table = table[table['name'].str.contains('Total capítulo')]

        table['section_id'] = table['section'].apply(_section_number)
        table['chapter_id'] = table['chapter'].apply(_last_token)
        table['hs2_id'] = table['section_id'] + table['chapter_id']

        table = table[['month_id', 'trade_flow_id', 'hs2_id', 'value']]

        table['value'] = table['value'].astype(float)
        table['month_id'] = table['month_id'].astype(int)
        table['hs2_id'] = table['hs2_id'].astype(int)
        table['value'] = table['value'].fillna(0)
        table['value'] = table['value'] * 1000

        return table
예제 #19
0
    def run_step(self, prev_result, params):
        """Build the functional-group / function / subfunction dimension.

        Distinct combinations of the six source columns are de-duplicated on
        the normalized subfunction description, translated to English,
        given composite string ids and finally spell-corrected and
        capitalized.

        Args:
            prev_result: DataFrame holding the six ``gpo_funcional`` ..
                ``desc_subfuncion`` columns.
            params: Unused.

        Returns:
            DataFrame with the id / Spanish / English columns of the three
            levels plus the join key ``desc_subfuncion_mod``.
        """
        logger.info('Running function dimension step...')

        def _normalize(text):
            return unidecode.unidecode(text.lower())

        def _to_english(text):
            return GoogleTranslator(source='auto', target='en').translate(text)

        def _spell_check(text):
            return speller.correction(text)

        def _capitalize(text):
            return text.capitalize()

        def _integer_part(text):
            return text.split('.')[0]

        source_columns = [
            'gpo_funcional', 'desc_gpo_funcional', 'id_funcion',
            'desc_funcion', 'id_subfuncion', 'desc_subfuncion'
        ]
        dim = prev_result[source_columns]
        dim = dim.groupby(source_columns).size().to_frame(
            name="count").reset_index()

        for column in ('desc_gpo_funcional', 'desc_funcion',
                       'desc_subfuncion'):
            dim[column + '_mod'] = dim[column].apply(_normalize)

        # Only the subfunction level is de-duplicated
        dim = dim.drop_duplicates(subset='desc_subfuncion_mod', keep='first')
        dim = dim.drop(columns=['count'])

        for column in ('desc_subfuncion', 'desc_funcion',
                       'desc_gpo_funcional'):
            dim[column + '_en'] = dim[column].apply(_to_english)

        # Ids become strings without a decimal part; function and
        # subfunction ids get a leading '0'.
        dim['gpo_funcional'] = dim['gpo_funcional'].astype(str).apply(
            _integer_part)
        for column in ('id_funcion', 'id_subfuncion'):
            dim[column] = dim[column].astype(str).apply(
                lambda text: '0' + _integer_part(text))

        dim['function_id'] = dim['gpo_funcional'] + dim['id_funcion']
        dim['subfunction_id'] = (dim['gpo_funcional'] + dim['id_funcion'] +
                                 dim['id_subfuncion'])

        dim = dim.rename(
            columns={
                'gpo_funcional': 'functional_group_id',
                'desc_gpo_funcional': 'functional_group_desc_es',
                'desc_funcion': 'function_des_es',
                'desc_subfuncion': 'subfunction_des_es',
                'desc_gpo_funcional_en': 'functional_group_desc_en',
                'desc_funcion_en': 'function_des_en',
                'desc_subfuncion_en': 'subfunction_des_en'
            })

        dim = dim.drop(columns=['id_funcion', 'id_subfuncion'])

        dim = dim[[
            'functional_group_id', 'functional_group_desc_es',
            'functional_group_desc_en', 'function_id', 'function_des_es',
            'function_des_en', 'subfunction_id', 'subfunction_des_es',
            'subfunction_des_en', 'desc_subfuncion_mod'
        ]]

        spanish_columns = ('functional_group_desc_es', 'function_des_es',
                           'subfunction_des_es')
        english_columns = ('functional_group_desc_en', 'function_des_en',
                           'subfunction_des_en')

        for column in spanish_columns:
            dim[column] = dim[column].apply(_spell_check)
        for column in english_columns:
            dim[column] = dim[column].apply(_capitalize)
        for column in spanish_columns:
            dim[column] = dim[column].apply(_capitalize)

        return dim
예제 #20
0
    def run_step(self, prev_result, params):
        """Build the chapter / concept dimension from the shared ``global_df``.

        Distinct chapter-concept combinations are de-duplicated on the
        normalized concept description, translated to English and
        capitalized (the chapter description is also spell-corrected). Rows
        whose concept is 'No registrado' are overwritten with fixed
        placeholder values.

        Args:
            prev_result: Unused.
            params: Unused.

        Returns:
            DataFrame with ``chapter_id`` / ``concept_id`` (str), their
            Spanish and English descriptions and the join key
            ``desc_concepto_mod``.
        """
        logger.info('Running chapter-concept dimension step...')

        def _normalize(text):
            return unidecode.unidecode(text.lower())

        def _to_english(text):
            return GoogleTranslator(source='auto', target='en').translate(text)

        def _spell_check(text):
            return speller.correction(text)

        def _capitalize(text):
            return text.capitalize()

        source_columns = [
            'id_capitulo', 'desc_capitulo', 'id_concepto', 'desc_concepto'
        ]
        global_df = self.get_pipeline_results_ref()["global_df"]
        dim = global_df[source_columns]
        dim = dim.groupby(source_columns).size().to_frame(
            name="count").reset_index()

        dim['desc_capitulo_mod'] = dim['desc_capitulo'].apply(_normalize)
        dim['desc_concepto_mod'] = dim['desc_concepto'].apply(_normalize)
        # Only the concept level is de-duplicated
        dim = dim.drop_duplicates(subset='desc_concepto_mod', keep='first')
        dim = dim.drop(columns=['count'])

        # Chapter: translate the raw text, then clean up the Spanish one
        dim['chapter_des_en'] = dim['desc_capitulo'].apply(_to_english).apply(
            _capitalize)
        dim['desc_capitulo'] = dim['desc_capitulo'].apply(_spell_check).apply(
            _capitalize)

        # Concept: translated and capitalized, but not spell-corrected
        dim['concept_des_en'] = dim['desc_concepto'].apply(_to_english).apply(
            _capitalize)
        dim['desc_concepto'] = dim['desc_concepto'].apply(_capitalize)

        dim = dim.rename(
            columns={
                'desc_capitulo': 'chapter_des',
                'id_capitulo': 'chapter_id',
                'desc_concepto': 'concept_des',
                'id_concepto': 'concept_id'
            })

        placeholder_columns = [
            'chapter_id', 'chapter_des', 'chapter_des_en', 'concept_id',
            'concept_des', 'concept_des_en'
        ]
        placeholder_values = [
            0, 'No registrado', 'Not registered', 999, 'No registrado',
            'Not registered'
        ]
        dim.loc[dim['concept_des'] == 'No registrado',
                placeholder_columns] = placeholder_values

        dim['chapter_id'] = dim['chapter_id'].astype(str)
        dim['concept_id'] = dim['concept_id'].astype(str)

        return dim[[
            'chapter_id', 'chapter_des', 'chapter_des_en', 'concept_id',
            'concept_des', 'concept_des_en', 'desc_concepto_mod'
        ]]
예제 #21
0
    def run_step(self, prev_result, params):
        """Read all budget workbooks into one frame with a common layout.

        Column names are lower-cased, the column set matching each file's
        layout is selected and renamed to ``columns_required_1``, and the
        file's ``quarter_id`` is looked up in ``files``.

        Args:
            prev_result: Iterable of entries indexed as ``(path, info)``
                where ``info['filename']`` is the file name without
                extension.
            params: Unused.

        Returns:
            The stacked, cleaned DataFrame: missing amounts are set to 0,
            rows with any other missing value are dropped, descriptions are
            capitalized and ``id_entidad_federativa`` is an int with 33
            remapped to 35.
        """
        logger.info('Running read step...')

        # Files whose source columns differ from the default layout
        layout_1_2_files = [
            'pef_ac01_avance_2t_2020', 'pef_ac01_avance_2t_2021',
            'pef_ac01_avance_3t_2020', 'pef_ac01_avance_1t_2021',
            'pef_ac01_avance_1t_2020'
        ]
        layout_1_1_files = [
            'ac01_avance_2t19', 'ac01_avance_marzo19',
            'pef_ac01_avance_4t_2020'
        ]
        layout_2_files = ['pef_ac01_avance_2t_2016']

        frames = []
        for file_ in prev_result:
            temp_df = pd.read_excel(file_[0])
            temp_df.columns = temp_df.columns.str.lower()

            filename = file_[1]['filename']
            if filename in layout_1_2_files:
                source_columns = columns_required_1_2
            elif filename in layout_1_1_files:
                source_columns = columns_required_1_1
            elif filename in layout_2_files:
                source_columns = columns_required_2
            else:
                source_columns = columns_required_1

            # Copy so the assignments below do not write to a slice
            temp_df = temp_df[source_columns].copy()
            temp_df.columns = columns_required_1
            temp_df['quarter_id'] = files[filename + '.xlsx']
            frames.append(temp_df)

        # Concatenate once instead of re-copying the accumulated frame for
        # every file.
        df = pd.concat(frames) if frames else pd.DataFrame()

        df[['monto_pagado',
            'monto_aprobado']] = df[['monto_pagado',
                                     'monto_aprobado']].fillna(value=0)

        df = df.dropna()

        for column in ('desc_gpo_funcional', 'desc_funcion',
                       'desc_subfuncion', 'desc_ramo', 'desc_tipogasto',
                       'desc_capitulo', 'desc_concepto'):
            df[column] = df[column].apply(lambda x: x.capitalize())

        df = df[df.id_entidad_federativa != ' '].copy()
        # The ids are ints at this point (a string-based replace had no
        # effect), so remap 33 -> 35 value by value.
        df['id_entidad_federativa'] = df['id_entidad_federativa'].apply(
            lambda x: 35 if x == 33 else x)
        df['id_entidad_federativa'] = df['id_entidad_federativa'].astype(int)

        df['monto_pagado'] = df['monto_pagado'].astype(float)

        return df
예제 #22
0
    def run_step(self, prev, params):
        """Log the step name and pass the previous result on unchanged.

        Args:
            prev: Result of the previous step.
            params: Unused.

        Returns:
            ``prev``, untouched.
        """
        logger.info("TransformStep")

        return prev
예제 #23
0
    def run_step(self, prev, params):
        """Log the step name and pass the previous result on.

        This step is a stub: it has no loading logic of its own. It used to
        return ``df``, a name that is never assigned in this method, so every
        call raised ``NameError``. It now returns ``prev``.
        NOTE(review): returning ``prev`` is an assumption about the intended
        behavior - confirm what this step is meant to load.

        Args:
            prev: Result of the previous step (may be ``None``).
            params: Unused.

        Returns:
            ``prev``, untouched.
        """
        logger.info("OpenStep")

        return prev
예제 #24
0
    def run_step(self, prev_result, params):
        """Join the budget rows with the four dimensions into the fact table.

        Normalized description columns (lower-cased, unidecode) are added to
        ``global_df`` and used as join keys against the function, chapter,
        department and expense-type dimensions.
        NOTE(review): the ``*_mod`` columns are written to the shared
        ``global_df`` object itself, as before - confirm nothing else relies
        on it staying untouched.

        Args:
            prev_result: Unused.
            params: Unused.

        Returns:
            DataFrame with the dimension ids plus ``amount_paid`` and
            ``amount_approved``; rows lacking any id are dropped.
        """
        logger.info('Running fact table step...')
        results = self.get_pipeline_results_ref()
        df = results["global_df"]
        dim_fun = results["dim_fun"]
        dim_cap = results["dim_cap"]
        dim_dep = results["dim_dep"]
        dim_exp = results["dim_exp"]

        for column in ('desc_subfuncion', 'desc_concepto', 'desc_ramo',
                       'desc_tipogasto'):
            df[column + '_mod'] = df[column].apply(
                lambda x: unidecode.unidecode(x.lower()))

        df = df.merge(dim_fun.drop_duplicates(subset=['subfunction_des_es']),
                      on="desc_subfuncion_mod",
                      how="left")
        df = df.merge(dim_cap.drop_duplicates(subset=['concept_des']),
                      on="desc_concepto_mod",
                      how="left")
        df = df.merge(dim_dep.drop_duplicates(subset=['department_des']),
                      on="desc_ramo_mod",
                      how="left")
        df = df.merge(dim_exp.drop_duplicates(subset=['exp_type_des']),
                      on="desc_tipogasto_mod",
                      how="left")

        df = df.drop(columns=[
            'desc_ramo', 'id_ramo', 'desc_tipogasto', 'id_tipogasto',
            'gpo_funcional', 'desc_gpo_funcional', 'id_funcion',
            'desc_funcion', 'id_subfuncion', 'desc_subfuncion', 'id_capitulo',
            'desc_capitulo', 'id_concepto', 'desc_concepto',
            'functional_group_desc_es', 'functional_group_desc_en',
            'function_des_es', 'function_des_en', 'subfunction_des_es',
            'subfunction_des_en', 'chapter_des', 'chapter_des_en',
            'concept_des', 'concept_des_en', 'entidad_federativa',
            'desc_subfuncion_mod', 'desc_concepto_mod', 'desc_ramo_mod',
            'desc_tipogasto_mod'
        ])

        df = df.rename(
            columns={
                'monto_pagado': 'amount_paid',
                'monto_aprobado': 'amount_approved',
                'id_entidad_federativa': 'ent_id',
            })

        id_columns = [
            'quarter_id', 'ent_id', 'functional_group_id', 'function_id',
            'subfunction_id', 'department_id', 'exp_type_id', 'chapter_id',
            'concept_id'
        ]
        # Copy so the assignments below do not write to a slice
        df = df[id_columns + ['amount_paid', 'amount_approved']].copy()

        for column in ('amount_paid', 'amount_approved'):
            df[column] = df[column].fillna(0)
            # Blank cells become 0. The previous lambda returned the
            # comparison ``x == 0`` (i.e. False) instead of the value 0.
            df[column] = df[column].apply(lambda x: 0 if x == ' ' else x)

        # Drop rows with missing ids BEFORE the casts: astype(str) turns NaN
        # into the string 'nan', which dropna can no longer detect.
        df = df.dropna(subset=id_columns)

        df['quarter_id'] = df['quarter_id'].astype(int)

        df['department_id'] = df['department_id'].astype(str)
        df['exp_type_id'] = df['exp_type_id'].astype(str)

        return df
예제 #25
0
 def run_step(self, prev, params):
     """Open ``data/Wakanda_<trade_flow>_<year>.csv`` as a DataFrame.

     Args:
         prev: Unused.
         params: Mapping with ``trade_flow`` and ``year`` entries used to
             build the file name.

     Returns:
         The CSV contents as read by ``pd.read_csv``.
     """
     trade_flow = params.get("trade_flow")
     year = params.get("year")
     filename = "Wakanda_{}_{}.csv".format(trade_flow, year)
     logger.info("Opening {}...".format(filename))

     source_path = "data/" + filename
     return pd.read_csv(source_path)
	def run_step(self, prev, params):
		"""Log the step name and return the previous result unchanged.

		Args:
			prev: Result of the previous step.
			params: Unused.

		Returns:
			``prev``, untouched.
		"""
		logger.info("TransformStep...")

		return prev