Code example #1
def clean_tables(table):
    db_connector = Connector.fetch('clickhouse-database',
                                   open('../conns.yaml'))
    query = 'DROP TABLE {}'.format(table)
    query_to_df(db_connector, raw_query=query)
    print('Success! {}'.format(table))

    return 0
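
These snippets assume bamboo_lib helpers that are imported elsewhere in each file. A minimal sketch of the presumed setup and a call to clean_tables (the Connector import path is an assumption based on similar Datawheel ETL repos; query_to_df living in bamboo_lib.helpers is confirmed by code example #30 below):

# Presumed imports; the Connector path is an assumption, while
# query_to_df is confirmed in bamboo_lib.helpers (see example #30).
from bamboo_lib.connectors.models import Connector  # assumed module path
from bamboo_lib.helpers import query_to_df

clean_tables('gobmx_covid')  # drops the table and prints 'Success! gobmx_covid'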
Code example #2
    def run_step(self, prev, params):

        df = pd.read_excel(prev, sheet_name='CeNaPI al 17 nov 2021')
        df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

        df.columns = [
            'park_name_id', 'park_name', 'park_type_id', 'ent_id', 'mun_id',
            'companies'
        ]
        df = df.drop_duplicates(['park_name', 'ent_id', 'mun_id'])

        # replacing park names with ids --> to check for new parks
        query = 'SELECT park_id, park_name FROM dim_names_industrial_parks'
        geo_mun = query_to_df(
            self.connector,
            raw_query=query).drop_duplicates(subset=['park_id'])
        names_dict = dict(zip(geo_mun['park_name'], geo_mun['park_id']))

        # check differences in park ids
        df['park_name'] = df['park_name'].replace(names_dict)
        print(df.loc[df['park_name_id'] != df['park_name']])

        # set park type id
        df["park_type_id"] = df["park_type_id"].replace(park_type)

        # replacing mun names with ids
        query = 'SELECT ent_id, ent_name, mun_id, mun_name FROM dim_shared_geography_mun'
        geo_mun = query_to_df(
            self.connector, raw_query=query).drop_duplicates(subset=['mun_id'])
        ent_dict = dict(zip(geo_mun['ent_name'], geo_mun['ent_id']))

        df.loc[df['ent_id'] == 'México', 'ent_id'] = 'Estado de México'

        df['ent_id'] = df['ent_id'].replace(ent_dict)

        # selective replacement (two municipalities in different entities may share the same name)
        df_final = pd.DataFrame()
        for entity in list(df['ent_id'].unique()):
            geo_mun_aux = geo_mun[geo_mun['ent_id'] == entity]
            mun_dict = dict(zip(geo_mun_aux['mun_name'],
                                geo_mun_aux['mun_id']))
            df_temp = df[df['ent_id'] == entity].copy()
            df_temp['mun_id'] = df_temp['mun_id'].replace(mun_dict)
            df_final = pd.concat([df_final, df_temp], ignore_index=True)

        df_final['mun_id'] = df_final['mun_id'].replace(wrong_muns).astype(int)

        df_final["count_parks"] = 1
        df_final["companies"] = df_final["companies"].fillna(0).astype(int)

        df_final.drop(['ent_id', 'park_name'], axis=1, inplace=True)

        df_final = df_final.groupby(['mun_id', 'park_name_id',
                                     'park_type_id']).sum().reset_index()

        df_final.loc[df_final['park_name_id'] == 703, 'companies'] = 12

        return df_final
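
The per-entity loop exists because two municipalities in different states can share a name, so a single global mun_name -> mun_id map would be ambiguous. A self-contained pandas sketch of the same pattern on hypothetical toy data:

import pandas as pd

# Toy lookup: the same mun_name maps to different mun_ids per entity.
geo_mun = pd.DataFrame({
    'ent_id':   [1, 2],
    'mun_name': ['Benito Juárez', 'Benito Juárez'],
    'mun_id':   [1003, 2005],
})
df = pd.DataFrame({'ent_id': [1, 2],
                   'mun_id': ['Benito Juárez', 'Benito Juárez']})

df_final = pd.DataFrame()
for entity in df['ent_id'].unique():
    aux = geo_mun[geo_mun['ent_id'] == entity]
    mun_dict = dict(zip(aux['mun_name'], aux['mun_id']))
    df_temp = df[df['ent_id'] == entity].copy()
    df_temp['mun_id'] = df_temp['mun_id'].replace(mun_dict)
    df_final = pd.concat([df_final, df_temp], ignore_index=True)

print(df_final)  # ent 1 resolves to 1003, ent 2 to 2005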
Code example #3
def clean_tables(table_name: str):
    db_connector = Connector.fetch('clickhouse-database',
                                   open('../conns.yaml'))

    try:
        query_to_df(db_connector, raw_query=f"DROP TABLE {table_name}")
        print(f"Drop Table {table_name} Success!!")
    except Exception:
        print(f"Table {table_name} does not exist.")
Code example #4
def get_dimensions():
    db_connector = Connector.fetch('clickhouse-database',
                                   open('../conns.yaml'))
    dim_geo_query = 'SELECT ent_name, ent_id FROM dim_shared_geography_ent'
    dim_geo = query_to_df(db_connector, raw_query=dim_geo_query)

    dim_country_query = 'SELECT country_name_es, country_name, iso3 FROM dim_shared_country'
    dim_country = query_to_df(db_connector, raw_query=dim_country_query)

    return [dim_geo, dim_country]
Code example #5
def replace_geo():
    """ query for ent, mun geo dimension"""
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))

    ent = query_to_df(db_connector, 'select ent_id, ent_name from dim_shared_geography_ent')
    ent['ent_name'] = ent['ent_name'].apply(lambda x: norm(x)).str.upper()
    ent = dict(zip(ent['ent_name'], ent['ent_id']))

    mun = query_to_df(db_connector, 'select mun_id, mun_name from dim_shared_geography_mun')
    mun['mun_name'] = mun['mun_name'].apply(lambda x: norm(x)).str.upper()
    mun = dict(zip(mun['mun_name'], mun['mun_id']))

    return [ent, mun] 
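
norm is a project helper that is not shown in this snippet; a plausible accent-stripping implementation (an assumption, not necessarily the project's actual code):

import unicodedata

def norm(text: str) -> str:
    # Decompose accented characters and drop the combining marks,
    # so that norm('México') == 'Mexico'.
    return ''.join(ch for ch in unicodedata.normalize('NFD', text)
                   if unicodedata.category(ch) != 'Mn')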
Code example #6
    def run_step(self, prev, params):

        df = pd.read_excel(prev, header=2, index_col=0)

        df = df[:-5]
        df = df.replace("2007 R/", 2007)

        df.iloc[1] = df.iloc[1].ffill()

        df.iloc[1] = df.iloc[1].apply(str)
        df.columns = (df.iloc[1] + '_' + df.iloc[2])

        df = df.reset_index()
        df = df[4:]
        df = df.dropna(axis=1, how='all')

        df = pd.melt(df, id_vars=['Miles de dólares'], var_name='time_id')

        df["time_id"] = df["time_id"].str.replace(" P/", "").str.replace('_IV','4').str.replace('_III','3').str.replace('_II','2').str.replace('_I','1').astype(int)

        df = df.rename(columns={"Miles de dólares": "ent_id"})

        df['value'] = (df['value'].astype(int))*1000

        # querying the database to get entity ids
        query = 'SELECT ent_id, ent_name FROM dim_shared_geography_ent'
        geo_ent = query_to_df(self.connector, raw_query=query).drop_duplicates('ent_id')
        ent_dict = dict(zip(geo_ent['ent_name'], geo_ent['ent_id']))

        df['ent_id'] = df['ent_id'].replace('México', 'Estado de México').replace(ent_dict).astype(int)

        return df
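
One subtlety in the time_id cleanup above: the Roman-numeral suffixes must be replaced longest-first, otherwise '_I' would match inside '_IV' and '_III'. A quick check:

s = '2020_IV'
# Correct order: longest suffix first.
print(s.replace('_IV', '4').replace('_I', '1'))  # '20204'
# Wrong order: '_I' matches inside '_IV' and leaves a stray 'V'.
print(s.replace('_I', '1').replace('_IV', '4'))  # '20201V'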
Code example #7
    def run_step(self, prev, params):

        TOKEN = os.getenv('TOKEN')
        url = 'https://oec.world/olap-proxy/data.jsonrecords?cube=complexity_eci_a_hs{}_hs{}&drilldowns=Year%2CECI+Rank%2CCountry&measures=ECI&token={}'.format(
            params.get('version'), params.get('level'), TOKEN)
        resp = requests.get(url)
        df = pd.DataFrame(resp.json()["data"])

        df = df.drop(columns="Country")
        df = df.rename(
            columns={
                "Year": "year",
                "Country ID": "country_id",
                "ECI Rank": "eci_rank",
                "ECI": "eci"
            })

        # TODO change source in "conns"
        df["country_id"] = df["country_id"].str[2:]

        df['country_id'] = df['country_id'].replace(wrong_iso3)

        query = 'select iso3, country_name from dim_shared_country'
        countries_df = query_to_df(
            self.connector, raw_query=query).drop_duplicates(subset=['iso3'])

        for country in df.country_id.unique():
            if country not in countries_df.iso3.unique():
                print("Missing Country: {}".format(country))

        df = df[["year", "country_id", "eci_rank", "eci"]]

        return df
Code example #8
    def run_step(self, prev, params):
        df = pd.read_excel(prev, header=4)

        df.dropna(inplace=True)

        df.rename(columns={'Unnamed: 0': 'ent_id'}, inplace=True)

        db_connector = Connector.fetch('clickhouse-database',
                                       open('../conns.yaml'))
        states = query_to_df(
            db_connector,
            raw_query='select ent_name, ent_id from dim_shared_geography_ent')

        df['ent_id'].replace(dict(zip(states['ent_name'], states['ent_id'])),
                             inplace=True)

        df = df.melt(id_vars='ent_id', var_name='decile')

        df.replace({'Estados Unidos Mexicanos': 33}, inplace=True)

        df['value'] = df['value'].astype(int)

        df['year'] = params.get('year')

        return df
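
The melt call is the recurring wide-to-long reshape in these pipelines: one column per decile becomes (ent_id, decile, value) rows. A minimal illustration:

import pandas as pd

wide = pd.DataFrame({'ent_id': [1, 2], 'I': [100, 200], 'II': [150, 250]})
long_df = wide.melt(id_vars='ent_id', var_name='decile')
print(long_df)
#    ent_id decile  value
# 0       1      I    100
# 1       2      I    200
# 2       1     II    150
# 3       2     II    250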
Code example #9
    def run_step(self, prev, params):

        dim_industry_query = 'SELECT sector_id, sector_es, sector_en FROM dim_shared_industry_fdi'
        df = query_to_df(self.connector, raw_query=dim_industry_query)
        df.drop_duplicates(subset='sector_id', inplace=True)

        return df
Code example #10
    def run_step(self, prev, params):

        df, exp_items = prev[0], prev[1]

        df["expense_item"] = df["expense_item"].apply(
            lambda x: string.capwords(x))

        df["ent_id"] = df["expense_item"].apply(lambda x: x
                                                if x in states else np.nan)
        df["ent_id"] = df["ent_id"].ffill()
        df = df[~df["expense_item"].isin(states)]

        db_connector = Connector.fetch('clickhouse-database',
                                       open('../conns.yaml'))
        states_db = query_to_df(
            db_connector,
            raw_query='select ent_name, ent_id from dim_shared_geography_ent')
        states_db["ent_name"] = states_db["ent_name"].apply(
            lambda x: string.capwords(x))

        df.loc[df["ent_id"] == "México", "ent_id"] = "Estado De México"
        df["ent_id"].replace(dict(
            zip(states_db["ent_name"], states_db["ent_id"])),
                             inplace=True)
        df["ent_id"] = df["ent_id"].replace(nat_replacement)

        df['year'] = params.get('year')

        df['nat_id'] = df['ent_id'].astype(str).apply(lambda x: 'mex'
                                                      if x == '0' else '0')

        # Mapping expense items with id's
        df_items = pd.read_csv(exp_items)
        dict_items = dict(zip(df_items["sub_group"], df_items["sub_group_id"]))

        df = df[df["expense_item"].isin(dict_items)]
        df["expense_item"] = df["expense_item"].replace(dict_items)

        # NOTE: float values are truncated toward zero when cast to int; check aggregations
        for col in ["year", "expense_item", "ent_id", "expense", "mean"]:
            df[col] = df[col].astype(int)

        df = df[[
            "nat_id", "ent_id", "year", "expense_item", "expense", "mean"
        ]].copy()

        deflactores = {
            "2016": 1.1919724122410000,
            "2018": 1.0715630294111653,
            "2020": 1
        }

        # expense to 2020 currency (using enigh deflactors)
        df["expense"] = df["expense"] * deflactores[str(params.get("year"))]
        df["expense"] = df["expense"].astype(int)

        df.drop("mean", axis=1, inplace=True)

        return df
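
The deflactores map rescales nominal amounts to 2020 pesos using ENIGH deflators, so a 2016 expense is multiplied by roughly 1.192 before the integer cast truncates it. For example:

deflactores = {"2016": 1.1919724122410000,
               "2018": 1.0715630294111653,
               "2020": 1}

expense_2016 = 10_000                           # nominal 2016 pesos
print(int(expense_2016 * deflactores["2016"]))  # 11919, truncated as in the pipeline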
Code example #11
    def run_step(self, prev_result, params):
        df = query_to_df(self.connector,
                         'select * from dim_shared_geography_ent',
                         ['cve_ent', 'ent_name', 'ent_id', 'nation_id', 'nation_name',
                          'nation_slug', 'ent_iso2', 'ent_iso3', 'ent_slug'])
        to_append = ['34', 'No distribuible geográficamente', 34, 'mex', 'México',
                     None, None, None, None]
        df.loc[len(df)] = to_append
        to_append1 = ['35', 'En el extranjero', 35, 'mex', 'México',
                      None, None, None, None]
        df.loc[len(df)] = to_append1
        print(df)
        return df
Code example #12
    def run_step(self, prev, params):
        names = [
            'year', 'exporter', 'importer', 'hs_original_id', 'value',
            'quantity'
        ]

        year = params.get('year')
        hs_code = params.get('hs_code')

        df = pd.read_csv('{}BACI_HS{}_Y{}_V202102.csv'.format(
            prev, hs_code, year),
                         header=0,
                         names=names)

        # Trade value comes in thousands of USD
        df['value'] = df['value'] * 1000

        df['hs_original_id'] = df['hs_original_id'].astype(str).apply(
            lambda x: x.zfill(6))
        df['hs_master_id'] = df['hs_original_id'].apply(
            lambda x: int(hs6_converter(x)))

        # Converting exporter and importer IDs to ISO3 codes
        shared_countries_df = query_to_df(
            self.connector,
            'select id_num, iso3 from dim_shared_country where id_num is not null',
            ['id_num', 'iso3'])
        id_num_iso3_map = dict(
            zip(shared_countries_df['id_num'], shared_countries_df['iso3']))

        clean_id_map = dict()

        for k, v in id_num_iso3_map.items():
            for _id in k.split('|'):
                try:
                    clean_id_map[int(_id)] = v
                except ValueError:
                    # Skip NaN's
                    pass

        df['exporter'] = df['exporter'].replace(clean_id_map).astype(str)
        df['importer'] = df['importer'].replace(clean_id_map).astype(str)

        revision_name = 'hs{}'.format(params['hs_code'])

        df['hs_revision'] = REVISION_MAP[revision_name]
        df['hs_revision'] = df['hs_revision'].astype(int)

        df = df[[
            'year', 'exporter', 'importer', 'hs_master_id', 'hs_revision',
            'hs_original_id', 'value', 'quantity'
        ]]

        df['version'] = datetime.now()
        df['version'] = pd.to_datetime(df['version'],
                                       infer_datetime_format=True)

        return df
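
The clean_id_map loop assumes dim_shared_country.id_num can hold pipe-delimited numeric codes; it flattens them into one ISO3 entry per code and skips non-numeric values. With hypothetical toy data:

id_num_iso3_map = {'484': 'mex', '840|842': 'usa', 'nan': 'xxx'}

clean_id_map = {}
for k, v in id_num_iso3_map.items():
    for _id in k.split('|'):
        try:
            clean_id_map[int(_id)] = v
        except ValueError:
            pass  # skip entries that are not integers

print(clean_id_map)  # {484: 'mex', 840: 'usa', 842: 'usa'}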
Code example #13
    def run_step(self, prev, params):

        df = prev

        # deciles to single variable
        df = df.melt(id_vars=["income_source"],
                     var_name="decile",
                     value_name="value")

        df["income_source"] = df["income_source"].apply(
            lambda x: string.capwords(x))

        df["ent_id"] = df["income_source"].apply(lambda x: x
                                                 if x in states else np.nan)
        df["ent_id"] = df["ent_id"].ffill()
        df = df[~df["income_source"].isin(states)]

        db_connector = Connector.fetch('clickhouse-database',
                                       open('../conns.yaml'))
        states_db = query_to_df(
            db_connector,
            raw_query='select ent_name, ent_id from dim_shared_geography_ent')
        states_db["ent_name"] = states_db["ent_name"].apply(
            lambda x: string.capwords(x))

        df.loc[df["ent_id"] == "México", "ent_id"] = "Estado De México"
        df["ent_id"].replace(dict(
            zip(states_db["ent_name"], states_db["ent_id"])),
                             inplace=True)
        df["ent_id"] = df["ent_id"].replace(nat_replacement)

        df["income_source"] = df["income_source"].replace(income_source)

        df['year'] = params.get('year')

        df['nat_id'] = df['ent_id'].astype(str).apply(lambda x: 'mex'
                                                      if x == '0' else '0')

        df.loc[df["value"].astype(str).str.startswith("0*"), "value"] = '0'

        for col in ["year", "income_source", "ent_id", "value"]:
            df[col] = df[col].astype(int)

        df = df[[
            "nat_id", "ent_id", "year", "decile", "income_source", "value"
        ]].copy()

        deflactores = {
            "2016": 1.1919724122410000,
            "2018": 1.0715630294111653,
            "2020": 1
        }

        # income values to 2020 currency (using enigh deflactors)
        df["value"] = df["value"] * deflactores[str(params.get("year"))]
        df["value"] = df["value"].astype(int)

        return df
Code example #14
    def run_step(self, prev, params):

        df = pd.read_csv(
            'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=6d352e35dcffafce',
            keep_default_na=False,
            na_values=[
                "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN",
                "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NULL", "NaN",
                "n/a", "nan", "null"
            ])

        # replace countries iso2 -> iso3
        db_connector = Connector.fetch('clickhouse-database',
                                       open('../conns.yaml'))
        query = 'SELECT iso2, iso3 FROM dim_shared_country'
        db_countries = query_to_df(db_connector, raw_query=query)
        db_countries = dict(zip(db_countries['iso2'], db_countries['iso3']))
        df['country_region_code'] = df['country_region_code'].str.lower()
        df['country_region_code'].replace(db_countries, inplace=True)

        # filter columns
        cols = [
            x for x in df.columns
            if x not in ['country_region', 'sub_region_1', 'sub_region_2']
        ]
        df = df.loc[:, cols].copy()

        # format date
        df['date'] = df['date'].str.replace('-', '').astype(int)

        df.rename(columns={
            'country_region_code': 'iso3',
            'date': 'date_id'
        },
                  inplace=True)

        for col in [
                'retail_and_recreation_percent_change_from_baseline',
                'grocery_and_pharmacy_percent_change_from_baseline',
                'parks_percent_change_from_baseline',
                'transit_stations_percent_change_from_baseline',
                'workplaces_percent_change_from_baseline',
                'residential_percent_change_from_baseline'
        ]:
            df[col] = df[col].astype(float)

        df = df[[
            'iso3', 'date_id',
            'retail_and_recreation_percent_change_from_baseline',
            'grocery_and_pharmacy_percent_change_from_baseline',
            'parks_percent_change_from_baseline',
            'transit_stations_percent_change_from_baseline',
            'workplaces_percent_change_from_baseline',
            'residential_percent_change_from_baseline'
        ]].copy()

        return df
Code example #15
    def run_step(self, prev, params):

        pk = params.get('pk_name')
        raw_query = 'SELECT {} FROM dim_shared_industry'.format(
            params.get('columns'))
        df = query_to_df(self.connector, raw_query=raw_query)
        df.drop_duplicates(subset=[pk], inplace=True)
        df[pk] = df[pk].astype(int)

        return df
Code example #16
    def run_step(self, prev, params):

        dim_industry_query = ('SELECT sector_id, sector_es, sector_en, subsector_id, '
                              'subsector_es, subsector_en, industry_group_id, '
                              'industry_group_es, industry_group_en '
                              'FROM dim_shared_industry_fdi')

        df = query_to_df(self.connector, raw_query=dim_industry_query)
        df.drop_duplicates(subset='industry_group_id', inplace=True)

        df[['subsector_id',
            'industry_group_id']] = df[['subsector_id',
                                        'industry_group_id']].astype(int)

        return df
Code example #17
    def run_step(self, prev, params):

        df = pd.read_csv(prev)

        for locale in ['es', 'en']:
            for level in [
                    'sector', 'subsector', 'industry_group', 'naics_industry',
                    'national_industry'
            ]:
                df.sort_values(by=['{}_id'.format(level)], inplace=True)
                df['{}_{}_short'.format(level,
                                        locale)] = df['{}_{}_short'.format(
                                            level, locale)].ffill()
                df['{}_{}'.format(level, locale)] = df['{}_{}'.format(
                    level, locale)].ffill()
                df.loc[df['{}_{}_short'.format(level, locale)].isna(), '{}_{}_short'.format(level, locale)] = \
                    df.loc[df['{}_{}_short'.format(level, locale)].isna(), '{}_{}'.format(level, locale)]

        # codes ids
        cols_es = list(df.columns[df.columns.str.contains('_es')])
        cols_en = list(df.columns[df.columns.str.contains('_en')])
        nltk.download('stopwords')
        stopwords_es = nltk.corpus.stopwords.words('spanish')
        df = format_text(df, cols_es, stopwords=stopwords_es)
        df = format_text(df, cols_en, stopwords=stop_words.ENGLISH_STOP_WORDS)

        for col in [
                'sector_id', 'subsector_id', 'industry_group_id',
                'naics_industry_id', 'national_industry_id'
        ]:
            df[col] = df[col].astype(str)

        # when creating the industry dimension, the CMS asks for members, so 'ghost' profiles are created;
        # they also appear in the search bar, which canon-cms-warmup also picks up
        query = 'SELECT distinct(national_industry_id) FROM inegi_economic_census'
        query_result = query_to_df(self.connector, raw_query=query)
        query_result = list(query_result['national_industry_id'])
        print('Total ids (dimension):', df.shape[0])
        print('Ids in dimension but not in data:',
              df.loc[~df['national_industry_id'].isin(query_result)].shape[0])
        print('Total ids (data):',
              df.loc[df['national_industry_id'].isin(query_result)].shape[0])
        df = df.loc[df['national_industry_id'].isin(query_result)].copy()

        if params.get('is_dim'):
            df.drop_duplicates(subset=['national_industry_es'], inplace=True)

        return df
Code example #18
    def run_step(self, prev_result, params):

        logger.info('Running Xform step...')

        df = prev_result

        df['month_id'] = df.Periodo.str.replace('/', '')
        df['trade_flow_id'] = df.name.apply(
            lambda x: 1 if 'Importaciones' in x.split() else 2)
        df['petroleum'] = df.name.apply(lambda x: 1
                                        if 'Petroleras' in x.split() else 2)
        df['country_name'] = df.name.apply(lambda x: x.split(' > ')[-1])

        dim_country = query_to_df(self.connector,
                                  'select * from dim_shared_country')

        dim_country = dim_country[['country_name_es', 'iso3']]
        iso3_dict_items_list = list(iso3_dict.items())
        df_temp = pd.DataFrame(iso3_dict_items_list,
                               columns=['country_name_es', 'iso3'])
        dim_country = pd.concat([dim_country, df_temp])

        for string in strings_to_remove:
            df.country_name = df.country_name.str.replace(string, '', 1)

        df = df[(~df.country_name.str.contains('total'))
                & ~df.country_name.str.contains('Total')]

        df = df.merge(dim_country[[
            'iso3', 'country_name_es'
        ]].drop_duplicates(subset=['country_name_es']),
                      left_on='country_name',
                      right_on='country_name_es',
                      how='left')

        df = df[['month_id', 'iso3', 'trade_flow_id', 'petroleum', 'value']]

        df = df[df.value != 'C']

        df.value = df.value.astype(float)
        df.month_id = df.month_id.astype(int)
        df.value = df.value.fillna(0)
        df.value = df.value * 1000

        return df
Code example #19
def check_update(files, table):
    db_connector = Connector.fetch('clickhouse-database',
                                   open('../conns.yaml'))

    current_files = []
    for level in ['ent', 'mun']:
        try:
            query = 'SELECT distinct(url) FROM {}{}'.format(table, level)
            temp = list(query_to_df(db_connector, raw_query=query)['url'])
        except Exception:
            temp = []
        current_files = current_files + temp

    if len(current_files) > 0:
        df = pd.DataFrame({'url': files})
        return list(df.loc[~df['url'].isin(current_files), 'url'])
    else:
        return files
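
The final filter in check_update is an order-preserving difference between the files found in GCP and the URLs already ingested; equivalently, with hypothetical paths:

import pandas as pd

files = ['gs://bucket/a.csv', 'gs://bucket/b.csv', 'gs://bucket/c.csv']
current_files = ['gs://bucket/a.csv']  # already in clickhouse

df = pd.DataFrame({'url': files})
print(list(df.loc[~df['url'].isin(current_files), 'url']))
# ['gs://bucket/b.csv', 'gs://bucket/c.csv']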
Code example #20
    def run_step(self, prev, params):

        if int(params.get("year")) < 2020:
            df = pd.read_excel(prev, header=4)
            df.rename(columns={'Unnamed: 0': 'ent_id'}, inplace=True)
        else:
            df = pd.read_excel(prev, header=3)
            df.rename(columns={'Entidad federativa': 'ent_id'}, inplace=True)

        df.dropna(inplace=True)

        db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
        states = query_to_df(db_connector, raw_query='select ent_name, ent_id from dim_shared_geography_ent')

        df['ent_id'].replace(dict(zip(states['ent_name'], states['ent_id'])), inplace=True)

        df = df.melt(id_vars='ent_id', var_name='decile')

        df.replace({'Estados Unidos Mexicanos': 33}, inplace=True)
        df.replace({'México': 15}, inplace=True)

        df['value'] = df['value'].astype(int)

        df['year'] = params.get('year')

        df['ent_id'] = df['ent_id'].replace(33, 0)
        
        df['nat_id'] = df['ent_id'].astype(str).apply(lambda x: 'mex' if x == '0' else '0')

        df = df[['nat_id', 'ent_id', 'year', 'decile', 'value']].copy()

        deflactores = {
            "2016": 1.1919724122410000,
            "2018": 1.0715630294111653,
            "2020": 1
        }

        # income values to 2020 currency (using enigh deflactors)
        df["value"] = df["value"] * deflactores[str(params.get("year"))]
        df["value"] = df["value"].astype(int)

        return df
Code example #21
    def run_step(self, prev, params):

        df = prev

        replace_dict = {
            'País de origen de los ingresos por remesas, ': '',
            'País de destino de los egresos por remesas, ': ''
        }

        for key, val in replace_dict.items():
            df['iso3'] = df['iso3'].apply(lambda x: x.replace(key, val))

        df = df[~df['iso3'].str.contains('Total')]

        # time reordering
        df['quarter_id'] = df['quarter_id'].str[-7:].str.replace('/', '')
        df['quarter_id'] = df['quarter_id'].str[-4:] + df['quarter_id'].str[:2]

        df['move'] = df['move'].replace(move_dict)

        query = 'SELECT country_name_es, iso3 FROM dim_shared_country'
        geo_country = query_to_df(self.connector, raw_query=query)

        # replacing country names with ids
        country_dict = dict(
            zip(geo_country['country_name_es'], geo_country['iso3']))
        df['iso3'] = df['iso3'].replace(country_dict).replace(wrong_countries)

        # quarters fix
        df['quarter_id'] = df['quarter_id'].str[0:4] + df['quarter_id'].str[
            4:].replace(trimester_map)

        for col in ['quarter_id', 'move']:
            df[col] = df[col].astype(int)

        # Amounts from Millions of USD to USD
        df['remittance_amount'] = df['remittance_amount'] * 1000000

        df['remittance_amount'] = df['remittance_amount'].apply(
            lambda x: round(x, 2))

        return df
Code example #22
File: util.py  Project: anabcm/data-etl
def check_update(files, table):
    """This function check's all the url's (files path in GCP) in clickhouse and compares with the files in GCP.
    The function returns a list of all new url's (new data), or all the url's because there's no data ingested in clickhouse."""
    db_connector = Connector.fetch('clickhouse-database',
                                   open('../conns.yaml'))

    current_files = []
    for level in ['ent', 'mun']:
        try:
            query = 'SELECT distinct(url) FROM {}{}'.format(table, level)
            temp = list(query_to_df(db_connector, raw_query=query)['url'])
        except Exception:
            temp = []
        current_files = current_files + temp

    if len(current_files) > 0:
        df = pd.DataFrame({'url': files})
        return list(df.loc[~df['url'].isin(current_files), 'url'])
    else:
        return files
Code example #23
    def run_step(self, prev, params):

        df = prev

        df = df.melt(id_vars=['título', 'trade_flow_id'],
                     var_name='month_id',
                     value_name='trade_value')
        df = df.rename(columns={'título': 'iso3'})

        # month re format
        df['month_id'] = df['month_id'].str.replace('/', '').str[2:]
        df['month_id'] = df['month_id'].str[2:] + df['month_id'].str[:2]

        # replacing country names with ids
        df['iso3'] = df['iso3'].str.split(',').str[-1].str[1:]
        df = df[~df['iso3'].str.contains('Total')]

        query = 'SELECT country_name_es, iso3, continent_es FROM dim_shared_country'
        geo_country = query_to_df(self.connector, raw_query=query)

        continent_list = list(geo_country['continent_es'])
        country_dict = dict(
            zip(geo_country['country_name_es'], geo_country['iso3']))
        df = df[~df['iso3'].isin(continent_list + not_countries)]
        df['iso3'] = df['iso3'].replace(country_dict).replace(wrong_countries)

        # thousands of USD to USD
        df['trade_value'] = df['trade_value'] * 1000

        for col in ['trade_flow_id', 'month_id', 'trade_value']:
            df[col] = df[col].astype(int)

        df = df.groupby(['iso3', 'trade_flow_id',
                         'month_id']).sum().reset_index()

        return df
Code example #24
    def run_step(self, prev, params):
        df, raw_to_careers, careers_to_id, raw_to_institution, institution_to_id = prev
        # type format
        for col in [
                'ent_id', 'mun_id', 'career', 'type', 'campus_id',
                'institution_id', 'program'
        ]:
            df[col] = df[col].ffill()
        df.ent_id = df.ent_id.str.title()

        # replace ids using an external table
        df.ent_id.replace(ORIGIN, inplace=True)

        # totals clean
        df.career = df.career.astype('str')
        for col in ['mun_id', 'career', 'type']:
            print('Current col:', col)
            df[col] = df[col].astype(str)
            df = df.loc[~df[col].str.contains('Total')].copy()
            df[col] = df[col].str.strip().str.replace('  ', ' ').str.replace(
                ':', '')
        df.career = df.career.str.replace('.', '', regex=False).astype('int')
        df['program'] = df['program'].astype(str).str.strip().str.upper()

        # municipality level id
        try:
            df.loc[:, 'mun_id'] = df.loc[:, 'ent_id'].astype(float).astype(int).astype(str) + \
                    df.loc[:, 'mun_id'].astype(float).astype(int).astype(str).str.zfill(3)
        except Exception as e:
            print('Municipality level id:', e)
            query = 'SELECT mun_id, mun_name FROM dim_shared_geography_mun'
            geo_mun = query_to_df(
                self.connector,
                raw_query=query).drop_duplicates(subset=['mun_id'])
            geo_mun['mun_name'] = geo_mun['mun_name'].str.upper()
            df['mun_id'].replace(dict(
                zip(geo_mun['mun_name'], geo_mun['mun_id'])),
                                 inplace=True)
        df.drop(columns=['ent_id'], inplace=True)

        # column names format
        df.columns = df.columns.str.replace('suma de ',
                                            '').str.replace('pni-', '')

        # 2020 rename measures
        if params.get('custom'):
            df.columns = [
                '{}-{}-{}'.format('mat',
                                  x.split('-')[2],
                                  x.split('-')[1])
                if len(x.split('-')) == 3 else x for x in df.columns
            ]
            # exceptions:
            age_range_replace = {
                'm-30-34-h': 'mat-h-30-34',
                'm-30-34-m': 'mat-m-30-34',
                'm-35-39-h': 'mat-h-35-39',
                'm-35-39-m': 'mat-m-35-39'
            }
            df.rename(columns=age_range_replace, inplace=True)

        # melt step
        age_range = [x for x in df.columns if 'mat-' in x]

        df = df[[
            'mun_id', 'career', 'type', 'campus_id', 'program',
            'institution_id'
        ] + age_range].copy()

        df.columns = df.columns.str.replace('mat-', '')

        df = df.melt(id_vars=[
            'mun_id', 'career', 'type', 'campus_id', 'program',
            'institution_id'
        ],
                     var_name='sex',
                     value_name='value')
        df = df.loc[df.value != 0]

        split = df['sex'].str.split('-', n=1, expand=True)
        df['sex'] = split[0]
        df['age'] = split[1]

        df.sex.replace(SEX, inplace=True)
        df.type = df.type.replace(TYPES).replace(type_fix)
        df.age.replace(AGE_RANGE, inplace=True)

        # stopwords es
        nltk.download('stopwords')

        # career_id replace
        # Raw name -> processed name
        careers = pd.read_csv(raw_to_careers)
        if params.get('year_plus') == 2021:
            careers['backup'] = careers['backup'].str.upper().apply(
                lambda x: unidecode.unidecode(x))
        df.loc[df['program'].isin(['#NAME?', '#¿NOMBRE?']), 'program'] = '0'
        df = df.loc[~df['program'].str.upper().str.contains('TOTAL')].copy()
        missing_ids = df.loc[(
            ~df['program'].isin(list(careers['backup']) + ['0']))].shape[0]
        print('Missing ids (careers):', missing_ids)
        if missing_ids > 0:
            print(df.loc[~df['program'].isin(list(careers['backup']))])
            df.loc[~df['program'].isin(list(careers['backup']))].to_csv(
                'review_career.csv', index=False)
        df.program = df.program.map(dict(zip(careers.backup, careers.name_es)))
        # Processed name -> numerical id
        careers = pd.read_csv(careers_to_id)
        df.program = df.program.map(
            dict(zip(careers.backup, careers.career_id)))
        df.program = df.program.fillna(0)

        # campus_id replace
        for col in ['institution_id', 'campus_id']:
            df[col] = df[col].astype(str).str.strip()
        df['backup'] = (df['institution_id'] + df['campus_id']).str.strip()

        # Raw name -> processed name
        raw_institution = pd.read_csv(raw_to_institution)

        # normalization
        df['backup'] = df['backup'].str.upper()
        raw_institution['backup'] = raw_institution['backup'].str.upper()

        df = df.loc[~df['backup'].str.upper().str.contains('TOTAL')].copy()
        missing_ids = df.loc[~df['backup'].isin(list(raw_institution['backup'])
                                                )]['backup'].unique()
        print('Missing ids (institutions):', len(missing_ids))
        if len(missing_ids) > 0:
            print(df.loc[~df['backup'].isin(list(raw_institution['backup']))])
            df.loc[~df['backup'].isin(list(raw_institution['backup']))].to_csv(
                'review_campus.csv', index=False)
        df.backup = df.backup.replace(
            dict(zip(raw_institution.backup, raw_institution.backup_raw)))

        institutions = pd.read_csv(institution_to_id)
        df['backup'] = df['backup'].str.upper()
        institutions['backup_raw'] = institutions['backup_raw'].str.upper()
        df.backup = df.backup.replace(
            dict(zip(institutions.backup_raw, institutions.campus_id)))

        # Processed name -> numerical id
        df['campus_id'] = df['backup'].copy()

        # replacing missing muns
        df['mun_id'] = df['mun_id'].replace(mun_fix)

        df['value'] = df['value'].replace('--', 0).fillna(0)

        for col in [
                'mun_id', 'career', 'program', 'type', 'sex', 'value', 'age',
                'campus_id'
        ]:
            df[col] = df[col].astype('int')

        df = df[[
            'mun_id', 'campus_id', 'program', 'type', 'sex', 'value', 'age'
        ]].copy()

        df['year'] = int(params.get('year_plus'))

        df.dropna(subset=['value'], inplace=True)

        return df
Code example #25
    def run_step(self, prev, params):

        gdp_nat, gdp_ent = prev[0], prev[1]

        # national quarterly gdp cleaning
        gdp_nation = pd.read_csv(gdp_nat)
        gdp_nation.columns = gdp_nation.columns.str.strip().str.lower(
        ).str.replace('  ', '_').str.replace(' ', '_')
        gdp_nation = gdp_nation.melt(id_vars=['concepto'],
                                     var_name='year',
                                     value_name='gdp_nat')
        gdp_nation = gdp_nation[gdp_nation['concepto'] ==
                                'a  B.1bV - Valor agregado bruto'].reset_index(
                                    drop=True)[['year', 'gdp_nat']].copy()
        gdp_nation['year'] = gdp_nation['year'].apply(
            lambda x: x.replace('undefined_', '').replace('_|_', '').replace(
                '_p', '').replace('_r', ''))
        gdp_nation = gdp_nation[gdp_nation['year'].str[-5:] == 'anual']

        # keep only the year (drop the quarter suffix)
        gdp_nation['year'] = gdp_nation['year'].str[:4]

        gdp_nation = gdp_nation.groupby(['year']).sum().reset_index()

        # state quarterly gdp cleaning
        gdp_ent = pd.read_csv(gdp_ent)
        gdp_ent.columns = gdp_ent.columns.str.strip().str.lower().str.replace(
            ' ', '_')
        gdp_ent = gdp_ent.iloc[6:-1].iloc[186:218, :].reset_index(drop=True)
        gdp_ent = gdp_ent.melt(
            id_vars='concepto', var_name='year',
            value_name='gdp_ent').rename(columns={'concepto': 'ent_id'})
        gdp_ent['year'] = gdp_ent['year'].str[:4]

        # querying the database to get entity ids
        query = 'SELECT ent_id, ent_name FROM dim_shared_geography_ent'
        geo_ent = query_to_df(self.connector,
                              raw_query=query).drop_duplicates('ent_id')
        ent_dict = dict(zip(geo_ent['ent_name'], geo_ent['ent_id']))

        gdp_ent['ent_id'] = gdp_ent['ent_id'].replace(
            'México', 'Estado de México').replace(ent_dict)

        # filling missing years in gdp_ent df
        for year in ['2020', '2021']:
            df_temp = pd.DataFrame(data={
                'ent_id': gdp_ent.ent_id.unique(),
                'year': year,
                'gdp_ent': np.nan
            })
            gdp_ent = pd.concat([gdp_ent, df_temp],
                                ignore_index=True).reset_index(drop=True)

        # cleaning (we won't use gdp_nation)
        gdp_nation = gdp_nation[
            gdp_nation['year'].astype(int) >= 2003].reset_index(
                drop=True).copy()
        gdp_ent = gdp_ent[~gdp_ent['gdp_ent'].isna()]
        gdp_ent['year'] = gdp_ent['year'].astype(int)

        gdp_ent['gdp_ent'] = gdp_ent['gdp_ent'].apply(lambda x: round(x, 2))

        return gdp_ent
Code example #26
    # drop table
    clean_tables('gobmx_covid')

    # ingest data
    pp = CovidPipeline()
    data = sorted(glob.glob('*.csv'))
    for chunk in pd.read_csv(data[-1],
                             encoding='latin-1',
                             iterator=True,
                             chunksize=10**5):
        pp.run({'chunk': chunk})

    # update time dimension
    db_connector = Connector.fetch('clickhouse-database',
                                   open('../conns.yaml'))
    min_query = 'SELECT min(ingress_date), min(symptoms_date), min(updated_date) FROM gobmx_covid'
    max_query = 'SELECT max(ingress_date), max(symptoms_date), max(updated_date) FROM gobmx_covid'
    min_value = str(
        min(query_to_df(db_connector, raw_query=min_query).iloc[0].to_list()))
    max_value = str(
        max(query_to_df(db_connector, raw_query=max_query).iloc[0].to_list()))
    min_time = '{}-{}-{}'.format(min_value[:4], min_value[4:6], min_value[6:])
    max_time = '{}-{}-{}'.format(max_value[:4], max_value[4:6], max_value[6:])

    pp = DimTimeDatePipeline()
    pp.run({'init': min_time, 'end': max_time})

    # covid stats dim time table
    pp = DimTimeDateCovidStatsPipeline()
    pp.run({'init': min_time, 'end': max_time})
    print('Duration: {}'.format(datetime.now() - start_time))
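
The min/max time bounds come back as YYYYMMDD integers, so the string slicing rebuilds ISO dates for the time-dimension pipelines:

min_value = str(20200301)  # hypothetical minimum date returned by the query
print('{}-{}-{}'.format(min_value[:4], min_value[4:6], min_value[6:]))  # '2020-03-01'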
Code example #27
    def run_step(self, prev, params):
        df, raw_to_careers, careers_to_id, raw_to_institution, institution_to_id = prev
        # type format
        for col in ['ent_id', 'mun_id', 'career', 'type', 'campus_id', 'institution_id', 'program']:
            df[col] = df[col].ffill()
        df.ent_id = df.ent_id.str.title()

        # replace ids using an external table
        df.ent_id.replace(ORIGIN, inplace=True)

        # totals clean
        df.career = df.career.astype('str')
        for col in ['mun_id', 'career', 'type']:
            print('Current col:', col)
            df[col] = df[col].astype(str)
            df = df.loc[~df[col].str.contains('Total')].copy()
            df[col] = df[col].str.strip().str.replace('  ', ' ').str.replace(':', '')
        df.career = df.career.str.replace('.', '', regex=False).astype('int')
        df['program'] = df['program'].astype(str).str.strip().str.upper()

        # municipality level id
        try:
            df.loc[:, 'mun_id'] = df.loc[:, 'ent_id'].astype(float).astype(int).astype(str) + \
                    df.loc[:, 'mun_id'].astype(float).astype(int).astype(str).str.zfill(3)
        except Exception as e:
            print('Municipality level id:', e)
            query = 'SELECT mun_id, mun_name FROM dim_shared_geography_mun'
            geo_mun = query_to_df(self.connector, raw_query=query).drop_duplicates(subset=['mun_id'])
            geo_mun['mun_name'] = geo_mun['mun_name'].str.upper()
            df['mun_id'].replace(dict(zip(geo_mun['mun_name'], geo_mun['mun_id'])), inplace=True)
        df.drop(columns=['ent_id'], inplace=True)

        # column names format
        df.columns = df.columns.str.replace('suma de ', '')

        # melt step
        df = df[['mun_id', 'career', 'type', 'campus_id', 'program', 'institution_id',
                 'pni-agu', 'pni-bc', 'pni-bcs', 'pni-cam', 'pni-coa',
                 'pni-col', 'pni-chia', 'pni-chih', 'pni-df', 'pni-dur', 'pni-gua',
                 'pni-gue', 'pni-hid', 'pni-jal', 'pni-mex', 'pni-mic', 'pni-mor',
                 'pni-nay', 'pni-nl', 'pni-oax', 'pni-pue', 'pni-que', 'pni-qr',
                 'pni-slp', 'pni-sin', 'pni-son', 'pni-tab', 'pni-tam', 'pni-tla',
                 'pni-ver', 'pni-yuc', 'pni-zac']].copy()

        # column names format
        df.columns = df.columns.str.replace('pni-', '')

        df = df.melt(id_vars=['mun_id', 'career', 'type', 'campus_id', 'program', 'institution_id'], var_name='origin', value_name='value')
        df = df.loc[df.value != 0]
        # external ids transformation
        df['origin'].replace(ENT_ORIGIN, inplace=True)

        # encoding
        df.type.replace(TYPES, inplace=True)

        # stopwords es
        nltk.download('stopwords')
        # career_id replace
        # Raw name -> processed name
        careers = pd.read_csv(raw_to_careers)
        df.loc[df['program'].isin(['#NAME?', '#¿NOMBRE?']), 'program'] = '0'
        df = df.loc[~df['program'].str.upper().str.contains('TOTAL')].copy()
        missing_ids = df.loc[(~df['program'].isin(list(careers['backup']) + ['0']))].shape[0]
        print('Missing ids (careers):', missing_ids)
        if missing_ids > 0:
            print(df.loc[~df['program'].isin(list(careers['backup']))])
            df.loc[~df['program'].isin(list(careers['backup']))].to_csv('review_career.csv', index=False)
        df.program = df.program.replace(dict(zip(careers.backup, careers.name_es)))
        # Processed name -> numerical id
        careers = pd.read_csv(careers_to_id)
        df.program = df.program.replace(dict(zip(careers.backup, careers.career_id)))

        # campus_id replace
        for col in ['institution_id', 'campus_id']:
            df[col] = df[col].astype(str).str.strip()
        df['backup'] = (df['institution_id'] + df['campus_id']).str.strip()

        # Raw name -> processed name
        institution = pd.read_csv(raw_to_institution)
        df = df.loc[~df['backup'].str.upper().str.contains('TOTAL')].copy()
        missing_ids = df.loc[~df['backup'].isin(list(institution['backup']))].shape[0]
        print('Missing ids (institutions):', missing_ids)
        if missing_ids > 0:
            print(df.loc[~df['backup'].isin(list(institution['backup']))])
            df.loc[~df['backup'].isin(list(institution['backup']))].to_csv('review_campus.csv', index=False)
        df.backup = df.backup.replace(dict(zip(institution.backup, institution.backup_raw)))
        # Processed name -> numerical id
        institution = pd.read_csv(institution_to_id)
        df.backup = df.backup.replace(dict(zip(institution.backup_raw, institution.campus_id)))
        df['campus_id'] = df['backup']

        for col in ['mun_id', 'program', 'type', 'origin', 'value', 'campus_id']:
            print('Current col:', col)
            df[col] = df[col].astype('float')

        df = df[['mun_id', 'campus_id', 'program', 'type', 'origin', 'value']].copy()

        df['year'] = int(params.get('year_plus'))

        return df
Code example #28
    def run_step(self, prev, params):

        BASE_URL = os.environ.get('BASE_URL')
        # Data from each report
        def _report(url):
            headers = {
                "Cache-Control": "no-cache",
                "Pragma": "no-cache"
            }
            r = requests.get(url, headers=headers)
            data_json = r.json()

            report = pd.DataFrame(data_json["data"])
            report = report[report["Municipality"] != "No Informado"]
            
            report = report.drop(columns={"Updated Date", "Municipality"})
            report = report.rename(columns={"Updated Date ID": "time_id", "Municipality ID":"mun_id"})
            
            df_temp = []
            for a, df_a in report.groupby("mun_id"):
                _df = df_a.copy()
                _df = _df.sort_values("time_id")
                _df["reported_cases"] = _df["Cases"].diff().fillna(0).astype(int)

                df_temp.append(_df)
            df_temp = pd.concat(df_temp, sort=False)
            
            return df_temp

        report_cases = _report(BASE_URL + "/data.jsonrecords?Covid+Result=1&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
        report_cases = report_cases.rename(columns={"reported_cases":"new_cases_report", "Cases":"accum_cases_report"})

        report_deaths = _report(BASE_URL + "/data.jsonrecords?Covid+Result=1&Is+Dead=1&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
        report_deaths = report_deaths.rename(columns={"reported_cases":"new_deaths_report", "Cases":"accum_deaths_report"})

        report_hospitalized = _report(BASE_URL + "/data.jsonrecords?Covid+Result=1&Patient+Type=2&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
        report_hospitalized = report_hospitalized.rename(columns={"reported_cases":"new_hospitalized_report", "Cases":"accum_hospitalized_report"})

        report_suspect = _report(BASE_URL + "/data.jsonrecords?Covid+Result=3&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
        report_suspect = report_suspect.rename(columns={"reported_cases":"new_suspect_report", "Cases":"accum_suspect_report"})

        report = pd.merge(report_cases, report_deaths, how="outer", on=["time_id", "mun_id"])
        report = pd.merge(report, report_hospitalized, how="outer", on=["time_id", "mun_id"])
        report = pd.merge(report, report_suspect, how="outer", on=["time_id", "mun_id"])

        # Create dictionary of municipality populations
        r = requests.get(BASE_URL + "/data?Year=2020&cube=population_projection&drilldowns=Municipality&measures=Projected+Population&parents=false&sparse=false")
        data_json = r.json()
        states_data = pd.DataFrame(data_json["data"])
        dicto_mun_population = dict(zip(states_data["Municipality ID"], states_data["Projected Population"]))

        # Data from the latest report
        data = sorted(glob.glob("*.csv"))
        df = pd.read_csv(data[-1], encoding="latin-1")
        df.columns = [x.strip().lower().replace(" ", "_") for x in df.columns]

        df["entidad_res"] = df["entidad_res"].astype(str).str.zfill(2)
        df["municipio_res"] = df["municipio_res"].astype(str).str.zfill(3)
        df["mun_id"] = (df["entidad_res"] + df["municipio_res"]).astype(int)

        # ids refactor
        df.loc[df['clasificacion_final'].isin([1,2,3]), 'resultado_lab'] = 1
        df.loc[df['clasificacion_final'].isin([4,5,6]), 'resultado_lab'] = 3
        df.loc[df['clasificacion_final'] == 7, 'resultado_lab'] = 2

        # Suspect cases
        df_suspect = df[["mun_id", "fecha_ingreso", "resultado_lab"]]
        df_suspect = df_suspect[df_suspect["resultado_lab"] == 3]
        df_suspect["resultado_lab"] = df_suspect["resultado_lab"].replace(3,1)
        df_suspect = df_suspect.rename(columns={"fecha_ingreso":"time_id", "resultado_lab":"daily_suspect"})
        df_suspect = df_suspect.groupby(["mun_id","time_id"]).sum().reset_index()

        # Hospitalized
        df_hosp = df[["mun_id", "fecha_ingreso", "resultado_lab", "tipo_paciente"]]
        df_hosp = df_hosp[(df_hosp["resultado_lab"] == 1) & (df_hosp["tipo_paciente"] == 2)]
        df_hosp = df_hosp.drop(columns="tipo_paciente")
        df_hosp = df_hosp.rename(columns={"fecha_ingreso":"time_id", "resultado_lab":"daily_hospitalized"})
        df_hosp = df_hosp.groupby(["mun_id","time_id"]).sum().reset_index()

        # Cases
        df1 = df[["mun_id", "fecha_ingreso", "resultado_lab"]]
        df1 = df1[df1["resultado_lab"] == 1]
        df1 = df1.rename(columns={"fecha_ingreso":"time_id", "resultado_lab":"daily_cases"})
        df1 = df1.groupby(["mun_id","time_id"]).sum().reset_index()

        df1 = pd.merge(df1, df_hosp, how="outer", on=["mun_id","time_id"])
        df1 = pd.merge(df1, df_suspect, how="outer", on=["mun_id","time_id"])
        df1["time_id"] = pd.to_datetime(df1["time_id"])

        # Replace unknown municipalities
        mun = query_to_df(self.connector, "select mun_id from dim_shared_geography_mun")
        df1.loc[~df1["mun_id"].isin(mun['mun_id']), "mun_id"] = 33000

        df1 = df1.groupby(["mun_id", "time_id"]).sum().reset_index()
        df1["time_id"] = pd.to_datetime(df1["time_id"])

        # Add missing dates for daily cases
        df1_ = []
        last_day = df1["time_id"].max()
        for a, df_a in df1.groupby("mun_id"):
            
            first_day = df_a["time_id"].min()
            idx = pd.date_range(first_day, last_day)
            df_b = df_a.reindex(idx)
            df_b = pd.DataFrame(df_b.index)
            df_b = df_b.rename(columns={0: "time_id"})
            
            result = pd.merge(df_b, df_a, how="outer", on="time_id")
            
            df1_.append(result)
        df1_ = pd.concat(df1_, sort=False)

        df1_["mun_id"] = df1_["mun_id"].fillna(method="ffill")

        # Deaths
        # fix max death date
        max_current_date = df['fecha_ingreso'].max()
        df.loc[df['fecha_def'] > max_current_date, 'fecha_def'] = "9999-99-99"
        df2 = df[["mun_id", "fecha_ingreso", "fecha_def", "resultado_lab"]]

        df2 = df2.rename(columns={"fecha_ingreso": "ingress_date", 
                                  "fecha_def": "death_date", 
                                  "resultado_lab": "daily_deaths"})

        df2 = df2[df2["daily_deaths"] == 1]
        df2 = df2[df2["death_date"]!= "9999-99-99"]

        for i in ["ingress_date", "death_date"]:
            df2[i] = pd.to_datetime(df2[i])
            
        df2["days_between_ingress_and_death"] = df2["death_date"] - df2["ingress_date"]
        df2["days_between_ingress_and_death"] = df2["days_between_ingress_and_death"].dt.days

        df2 = df2.drop(columns="ingress_date")
        df2 = df2.rename(columns={"death_date":"time_id"})

        df2 = df2.groupby(["mun_id", "time_id"]).agg({"daily_deaths": "sum", "days_between_ingress_and_death": "mean"}).reset_index()

        # Add missing dates for deaths
        df2_ = []
        last_day = df2["time_id"].max()
        for a, df_a in df2.groupby("mun_id"):
            
            first_day = df_a["time_id"].min()
            idx = pd.date_range(first_day, last_day)
            df_b = df_a.reindex(idx)
            df_b = pd.DataFrame(df_b.index)
            df_b = df_b.rename(columns={0:"time_id"})
            
            result = pd.merge(df_b, df_a, how="outer", on="time_id")
            
            df2_.append(result)
        df2_ = pd.concat(df2_, sort=False)

        df2_["mun_id"] = df2_["mun_id"].fillna(method="ffill")

        # Merge daily cases and deaths
        data = pd.merge(df1_, df2_, how="outer", on=["time_id", "mun_id"])
        data = data.sort_values(by=["time_id", "mun_id"])

        data["time_id"] = data["time_id"].astype(str).str.replace("-", "").astype(int)

        # Merge the latest report data with data from each report
        data = pd.merge(data, report, how="outer", on=["time_id", "mun_id"])

        for col in ["daily_cases", "daily_hospitalized", "daily_suspect", "daily_deaths", "accum_cases_report", "new_cases_report", "accum_deaths_report", "new_deaths_report", "accum_hospitalized_report", "new_hospitalized_report", "accum_suspect_report", "new_suspect_report"]:
            data[col] = data[col].fillna(0).astype(int)

        # Add columns of accumulated cases
        df_final = []
        for a, df_a in data.groupby("mun_id"):
            # create a temporary df to silence "SettingWithCopyWarning"
            _df = df_a.copy()
            _df["accum_cases"] = _df["daily_cases"].cumsum()
            _df["accum_deaths"] = _df["daily_deaths"].cumsum()
            _df["accum_hospitalized"] = _df.daily_hospitalized.cumsum()
            _df["accum_suspect"] = _df.daily_suspect.cumsum()

            for i in ["daily_cases", "accum_cases", "daily_deaths", "accum_deaths", "accum_cases_report", "new_cases_report", "accum_deaths_report", "new_deaths_report"]:
                measure = "avg7_{}".format(i)
                _df[measure] = _df[i].rolling(7).mean()
                
            for j in ["daily_cases", "accum_cases", "daily_deaths", "accum_deaths", "accum_cases_report", "new_cases_report", "accum_deaths_report", "new_deaths_report"]:
                measure = "sum_last7_{}".format(j)
                _df[measure] = _df[j].rolling(7).sum()

            df_final.append(_df)
        df_final = pd.concat(df_final, sort=False)

        for k in ["sum_last7_daily_cases", "sum_last7_accum_cases", "sum_last7_daily_deaths", "sum_last7_accum_deaths", "sum_last7_accum_cases_report", "sum_last7_new_cases_report", "sum_last7_accum_deaths_report", "sum_last7_new_deaths_report"]:
            df_final[k] = df_final[k].fillna(0).astype(int)

        # Rate per 100,000 inhabitants
        df_final["population_temp"] = df_final["mun_id"].replace(dicto_mun_population)
        df_final["population"] = np.where(df_final["population_temp"] != df_final["mun_id"], df_final["population_temp"], np.nan)

        df_final["rate_daily_cases"] = (df_final["daily_cases"] / df_final["population"]) * 100000
        df_final["rate_accum_cases"] = (df_final["accum_cases"] / df_final["population"]) * 100000
        df_final["rate_new_cases_report"] = (df_final["new_cases_report"] / df_final["population"]) * 100000
        df_final["rate_accum_cases_report"] = (df_final["accum_cases_report"] / df_final["population"]) * 100000

        df_final["rate_daily_deaths"] = (df_final["daily_deaths"] / df_final["population"]) * 100000
        df_final["rate_accum_deaths"] = (df_final["accum_deaths"] / df_final["population"]) * 100000
        df_final["rate_new_deaths_report"] = (df_final["new_cases_report"] / df_final["population"]) * 100000
        df_final["rate_accum_deaths_report"] = (df_final["accum_deaths_report"] / df_final["population"]) * 100000

        df_final = df_final.drop(columns={"population", "population_temp"})
            
        # Add a column counting days since at least 50 cases accumulated
        day = []
        for a, df_a in df_final.groupby("mun_id"):
            default = 0
            for row in df_a.iterrows():
                if row[1]["accum_cases"] >= 50:
                    default = default + 1

                day.append(default)
        df_final["day_from_50_cases"] = day

        # Add a column counting days since at least 10 deaths accumulated
        day = []
        for a, df_a in df_final.groupby("mun_id"):
            default = 0
            for row in df_a.iterrows():
                if row[1]["accum_deaths"] >= 10:
                    default = default + 1

                day.append(default)
        df_final["day_from_10_deaths"] = day
        
        df_final["mun_id"] = df_final["mun_id"].astype(int)

        print(datetime.now(), df_final["time_id"].max())
        
        return df_final
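
The avg7_* and sum_last7_* measures are trailing 7-day windows computed per municipality; pandas leaves the first six rows of each window NaN until it fills, which is why the sums are fillna(0)-ed afterwards. A small demonstration:

import pandas as pd

daily = pd.Series([0, 1, 2, 3, 4, 5, 6, 7])
print(daily.rolling(7).mean().tolist())
# [nan, nan, nan, nan, nan, nan, 3.0, 4.0]
print(daily.rolling(7).sum().tolist())
# [nan, nan, nan, nan, nan, nan, 21.0, 28.0]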
Code example #29
File: status_pipeline.py  Project: anabcm/data-etl
    def run_step(self, prev, params):
        df, raw_to_careers, careers_to_id, raw_to_institution, institution_to_id = prev
        # type format
        for col in [
                'ent_id', 'mun_id', 'career', 'type', 'campus_id',
                'institution_id', 'program'
        ]:
            df[col] = df[col].ffill()
        df.ent_id = df.ent_id.str.title()

        # ids replace from external table
        df.ent_id.replace(ORIGIN, inplace=True)

        # totals clean
        df.career = df.career.astype('str')
        for col in ['mun_id', 'career', 'type']:
            print('Current col:', col)
            df[col] = df[col].astype(str)
            df = df.loc[~df[col].str.contains('Total')].copy()
            df[col] = df[col].str.strip().str.replace('  ', ' ').str.replace(
                ':', '')
        df.career = df.career.str.replace('.', '', regex=False).astype('int')
        df['program'] = df['program'].astype(str).str.strip().str.upper()

        # municipality level id
        try:
            df.loc[:, 'mun_id'] = df.loc[:, 'ent_id'].astype(float).astype(int).astype(str) + \
                    df.loc[:, 'mun_id'].astype(float).astype(int).astype(str).str.zfill(3)
        except Exception as e:
            print('Municipality level id:', e)
            query = 'SELECT mun_id, mun_name FROM dim_shared_geography_mun'
            geo_mun = query_to_df(
                self.connector,
                raw_query=query).drop_duplicates(subset=['mun_id'])
            geo_mun['mun_name'] = geo_mun['mun_name'].str.upper()
            df['mun_id'].replace(dict(
                zip(geo_mun['mun_name'], geo_mun['mun_id'])),
                                 inplace=True)
        df.drop(columns=['ent_id'], inplace=True)

        # column names format
        df.columns = df.columns.str.replace('suma de ', '')

        # melt step
        try:
            df = df[[
                'mun_id', 'career', 'type', 'campus_id', 'institution_id',
                'program', 'e-h', 'e-m', 'g-h', 'g-m'
            ]].copy()

        except KeyError:  # fall back when the g-* columns are absent
            df = df[[
                'mun_id', 'career', 'type', 'campus_id', 'institution_id',
                'program', 'e-h', 'e-m', 't-h', 't-m'
            ]].copy()

        df = df.melt(id_vars=[
            'mun_id', 'career', 'type', 'campus_id', 'institution_id',
            'program'
        ],
                     var_name='stat',
                     value_name='value')
        df = df.loc[df.value != 0]

        split = df['stat'].str.split('-', n=1, expand=True)
        df['stat'] = split[0]
        df['sex'] = split[1]

        # encoding
        df.stat.replace(STAT, inplace=True)
        df.sex.replace(SEX, inplace=True)
        df.type.replace(TYPES, inplace=True)

        # 2020 version status fix
        df.loc[(df['type'].isin([12, 13, 14])) & (df['stat'] == 3), 'stat'] = 2

        # stopwords es
        nltk.download('stopwords')

        # career_id replace
        # Raw name -> processed name
        careers = pd.read_csv(raw_to_careers)
        df.loc[df['program'].isin(['#NAME?', '#¿NOMBRE?']), 'program'] = '0'
        df = df.loc[~df['program'].str.upper().str.contains('TOTAL')].copy()
        missing_ids = df.loc[(
            ~df['program'].isin(list(careers['backup']) + ['0']))].shape[0]
        print('Missing ids (careers):', missing_ids)
        if missing_ids > 0:
            print(df.loc[~df['program'].isin(list(careers['backup']))])
            df.loc[~df['program'].isin(list(careers['backup']))].to_csv(
                'review_career.csv', index=False)
        df.program = df.program.replace(
            dict(zip(careers.backup, careers.name_es)))
        # Processed name -> numerical id
        careers = pd.read_csv(careers_to_id)
        df.program = df.program.replace(
            dict(zip(careers.backup, careers.career_id)))

        # campus_id replace
        for col in ['institution_id', 'campus_id']:
            df[col] = df[col].astype(str).str.strip()
        df['backup'] = (df['institution_id'] + df['campus_id']).str.strip()

        # Raw name -> processed name
        institution = pd.read_csv(raw_to_institution)
        df = df.loc[~df['backup'].str.upper().str.contains('TOTAL')].copy()
        missing_ids = df.loc[~df['backup'].isin(list(institution['backup'])
                                                )].shape[0]
        print('Missing ids (institutions):', missing_ids)
        if missing_ids > 0:
            print(df.loc[~df['backup'].isin(list(institution['backup']))])
            df.loc[~df['backup'].isin(list(institution['backup']))].to_csv(
                'review_campus.csv', index=False)
        df.backup = df.backup.replace(
            dict(zip(institution.backup, institution.backup_raw)))
        # Processed name -> numerical id
        institution = pd.read_csv(institution_to_id)
        df.backup = df.backup.replace(
            dict(zip(institution.backup_raw, institution.campus_id)))
        df['campus_id'] = df['backup']

        for col in [
                'mun_id', 'career', 'program', 'type', 'sex', 'value', 'stat',
                'campus_id'
        ]:
            print('Current col:', col)
            df[col] = df[col].astype('float')

        df = df[[
            'mun_id', 'campus_id', 'program', 'type', 'sex', 'value', 'stat'
        ]].copy()

        df['year'] = int(params.get('year_plus'))

        return df
Code example #30
import pandas as pd
from tabulate import tabulate
from bamboo_lib.helpers import grab_connector, query_to_df

db_connector = grab_connector(__file__, "clickhouse-remote")

# Get table names
names_df = query_to_df(db_connector, "SHOW TABLES")
tables = names_df["name"].tolist()

# Print row counts by year
for t in [table for table in tables if "acs" in table]:
    print("\nTABLE: {}".format(t))

    query = "SELECT year, COUNT(*) FROM {} GROUP BY year ORDER BY year;".format(
        t)
    df = query_to_df(db_connector, query, col_headers=["year", "count"])
    new_df = pd.DataFrame({
        "year": df["year"].tolist(),
        "count": ["{:,}".format(int(c)) for c in df["count"]]
    })
    print(tabulate(new_df, headers="keys", tablefmt="psql", showindex=False))