def clean_tables(table):
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    query = 'DROP TABLE {}'.format(table)
    query_to_df(db_connector, raw_query=query)
    print('Success! {}'.format(table))
    return 0
def run_step(self, prev, params):
    df = pd.read_excel(prev, sheet_name='CeNaPI al 17 nov 2021')
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
    df.columns = [
        'park_name_id', 'park_name', 'park_type_id', 'ent_id', 'mun_id',
        'companies'
    ]
    df = df.drop_duplicates(['park_name', 'ent_id', 'mun_id'])

    # replacing park names with ids --> to check for new parks
    query = 'SELECT park_id, park_name FROM dim_names_industrial_parks'
    geo_mun = query_to_df(self.connector, raw_query=query).drop_duplicates(subset=['park_id'])
    names_dict = dict(zip(geo_mun['park_name'], geo_mun['park_id']))

    # check differences in park ids
    df['park_name'] = df['park_name'].replace(names_dict)
    print(df.loc[df['park_name_id'] != df['park_name']])

    # set park type id
    df["park_type_id"] = df["park_type_id"].replace(park_type)

    # replacing mun names with ids
    query = 'SELECT ent_id, ent_name, mun_id, mun_name FROM dim_shared_geography_mun'
    geo_mun = query_to_df(self.connector, raw_query=query).drop_duplicates(subset=['mun_id'])
    ent_dict = dict(zip(geo_mun['ent_name'], geo_mun['ent_id']))
    df.loc[df['ent_id'] == 'México', 'ent_id'] = 'Estado de México'
    df['ent_id'] = df['ent_id'].replace(ent_dict)

    # selective replacement (two municipalities in different entities may share a name)
    df_final = pd.DataFrame()
    for entity in list(df['ent_id'].unique()):
        geo_mun_aux = geo_mun[geo_mun['ent_id'] == entity]
        mun_dict = dict(zip(geo_mun_aux['mun_name'], geo_mun_aux['mun_id']))
        df_temp = df[df['ent_id'] == entity].copy()
        df_temp['mun_id'] = df_temp['mun_id'].replace(mun_dict)
        df_final = pd.concat([df_final, df_temp], ignore_index=True)

    df_final['mun_id'] = df_final['mun_id'].replace(wrong_muns).astype(int)
    df_final["count_parks"] = 1
    df_final["companies"] = df_final["companies"].fillna(0).astype(int)
    df_final.drop(['ent_id', 'park_name'], axis=1, inplace=True)
    df_final = df_final.groupby(['mun_id', 'park_name_id', 'park_type_id']).sum().reset_index()
    df_final.loc[df_final['park_name_id'] == 703, 'companies'] = 12
    return df_final
def clean_tables(table_name: str):
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    try:
        query_to_df(db_connector, raw_query=f"DROP TABLE {table_name}")
        print(f"Drop Table {table_name} Success!!")
    except Exception:
        print(f"Table {table_name} does not exist.")
def get_dimensions():
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    dim_geo_query = 'SELECT ent_name, ent_id FROM dim_shared_geography_ent'
    dim_geo = query_to_df(db_connector, raw_query=dim_geo_query)
    dim_country_query = 'SELECT country_name_es, country_name, iso3 FROM dim_shared_country'
    dim_country = query_to_df(db_connector, raw_query=dim_country_query)
    return [dim_geo, dim_country]
def replace_geo():
    """Query the ent and mun geography dimensions and return name -> id mappings."""
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))

    ent = query_to_df(db_connector, 'select ent_id, ent_name from dim_shared_geography_ent')
    ent['ent_name'] = ent['ent_name'].apply(lambda x: norm(x)).str.upper()
    ent = dict(zip(ent['ent_name'], ent['ent_id']))

    mun = query_to_df(db_connector, 'select mun_id, mun_name from dim_shared_geography_mun')
    mun['mun_name'] = mun['mun_name'].apply(lambda x: norm(x)).str.upper()
    mun = dict(zip(mun['mun_name'], mun['mun_id']))

    return [ent, mun]
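# A minimal usage sketch for replace_geo(), assuming a DataFrame `df` whose
# 'ent_name' / 'mun_name' columns were normalized the same way (norm + upper);
# the caller and column names here are illustrative, not taken from the pipelines.
ent_dict, mun_dict = replace_geo()
df['ent_id'] = df['ent_name'].replace(ent_dict)
df['mun_id'] = df['mun_name'].replace(mun_dict)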
def run_step(self, prev, params):
    df = pd.read_excel(prev, header=2, index_col=0)
    df = df[:-5]
    df = df.replace("2007 R/", 2007)
    df.iloc[1] = df.iloc[1].fillna(method='ffill')
    df.iloc[1] = df.iloc[1].apply(str)
    df.columns = (df.iloc[1] + '_' + df.iloc[2])
    df = df.reset_index()
    df = df[4:]
    df = df.dropna(axis=1, how='all')
    df = pd.melt(df, id_vars=['Miles de dólares'], var_name='time_id')
    df["time_id"] = df["time_id"].str.replace(" P/", "") \
        .str.replace('_IV', '4').str.replace('_III', '3') \
        .str.replace('_II', '2').str.replace('_I', '1').astype(int)
    df = df.rename(columns={"Miles de dólares": "ent_id"})
    df['value'] = (df['value'].astype(int)) * 1000

    # querying the database to get entity ids
    query = 'SELECT ent_id, ent_name FROM dim_shared_geography_ent'
    geo_ent = query_to_df(self.connector, raw_query=query).drop_duplicates('ent_id')
    ent_dict = dict(zip(geo_ent['ent_name'], geo_ent['ent_id']))
    df['ent_id'] = df['ent_id'].replace('México', 'Estado de México').replace(ent_dict).astype(int)

    return df
def run_step(self, prev, params):
    TOKEN = os.getenv('TOKEN')
    url = 'https://oec.world/olap-proxy/data.jsonrecords?cube=complexity_eci_a_hs{}_hs{}&drilldowns=Year%2CECI+Rank%2CCountry&measures=ECI&token={}'.format(
        params.get('version'), params.get('level'), TOKEN)
    resp = requests.get(url)
    df = pd.DataFrame(resp.json()["data"])
    df = df.drop(columns="Country")
    df = df.rename(
        columns={
            "Year": "year",
            "Country ID": "country_id",
            "ECI Rank": "eci_rank",
            "ECI": "eci"
        })

    # TODO change source in "conns"
    df["country_id"] = df["country_id"].str[2:]
    df['country_id'] = df['country_id'].replace(wrong_iso3)

    query = 'select iso3, country_name from dim_shared_country'
    countries_df = query_to_df(self.connector, raw_query=query).drop_duplicates(subset=['iso3'])
    for country in df.country_id.unique():
        if country not in countries_df.iso3.unique():
            print("Missing Country: {}".format(country))

    df = df[["year", "country_id", "eci_rank", "eci"]]
    return df
def run_step(self, prev, params):
    df = pd.read_excel(prev, header=4)
    df.dropna(inplace=True)
    df.rename(columns={'Unnamed: 0': 'ent_id'}, inplace=True)

    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    states = query_to_df(
        db_connector,
        raw_query='select ent_name, ent_id from dim_shared_geography_ent')
    df['ent_id'].replace(dict(zip(states['ent_name'], states['ent_id'])), inplace=True)

    df = df.melt(id_vars='ent_id', var_name='decile')
    df.replace({'Estados Unidos Mexicanos': 33}, inplace=True)
    df['value'] = df['value'].astype(int)
    df['year'] = params.get('year')
    return df
def run_step(self, prev, params):
    dim_industry_query = 'SELECT sector_id, sector_es, sector_en FROM dim_shared_industry_fdi'
    df = query_to_df(self.connector, raw_query=dim_industry_query)
    df.drop_duplicates(subset='sector_id', inplace=True)
    return df
def run_step(self, prev, params):
    df, exp_items = prev[0], prev[1]
    df["expense_item"] = df["expense_item"].apply(lambda x: string.capwords(x))
    df["ent_id"] = df["expense_item"].apply(lambda x: x if x in states else np.nan)
    df["ent_id"] = df["ent_id"].ffill()
    df = df[~df["expense_item"].isin(states)]

    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    states_db = query_to_df(
        db_connector,
        raw_query='select ent_name, ent_id from dim_shared_geography_ent')
    states_db["ent_name"] = states_db["ent_name"].apply(lambda x: string.capwords(x))
    df.loc[df["ent_id"] == "México", "ent_id"] = "Estado De México"
    df["ent_id"].replace(dict(zip(states_db["ent_name"], states_db["ent_id"])), inplace=True)
    df["ent_id"] = df["ent_id"].replace(nat_replacement)
    df['year'] = params.get('year')
    df['nat_id'] = df['ent_id'].astype(str).apply(lambda x: 'mex' if x == '0' else '0')

    # Mapping expense items with ids
    df_items = pd.read_csv(exp_items)
    dict_items = dict(zip(df_items["sub_group"], df_items["sub_group_id"]))
    df = df[df["expense_item"].isin(dict_items)]
    df["expense_item"] = df["expense_item"].replace(dict_items)

    # !!!! float values are rounding down when changing dtype. Check aggregations !!!
    for col in ["year", "expense_item", "ent_id", "expense", "mean"]:
        df[col] = df[col].astype(int)
    df = df[["nat_id", "ent_id", "year", "expense_item", "expense", "mean"]].copy()

    deflactores = {
        "2016": 1.1919724122410000,
        "2018": 1.0715630294111653,
        "2020": 1
    }
    # expense to 2020 currency (using ENIGH deflators)
    df["expense"] = df["expense"] * deflactores[str(params.get("year"))]
    df["expense"] = df["expense"].astype(int)
    df.drop("mean", axis=1, inplace=True)
    return df
def run_step(self, prev_result, params):
    df = query_to_df(
        self.connector, 'select * from dim_shared_geography_ent',
        ['cve_ent', 'ent_name', 'ent_id', 'nation_id', 'nation_name',
         'nation_slug', 'ent_iso2', 'ent_iso3', 'ent_slug'])
    to_append = ['34', 'No distribuible geográficamente', 34, 'mex', 'México',
                 None, None, None, None]
    df.loc[len(df)] = to_append
    to_append1 = ['35', 'En el extranjero', 35, 'mex', 'México',
                  None, None, None, None]
    df.loc[len(df)] = to_append1
    print(df)
    return df
def run_step(self, prev, params):
    names = ['year', 'exporter', 'importer', 'hs_original_id', 'value', 'quantity']
    year = params.get('year')
    hs_code = params.get('hs_code')
    df = pd.read_csv('{}BACI_HS{}_Y{}_V202102.csv'.format(prev, hs_code, year),
                     header=0, names=names)

    # Trade value comes in thousands of USD
    df['value'] = df['value'] * 1000
    df['hs_original_id'] = df['hs_original_id'].astype(str).apply(lambda x: x.zfill(6))
    df['hs_master_id'] = df['hs_original_id'].apply(lambda x: int(hs6_converter(x)))

    # Converting exporter and importer IDs to ISO3 codes
    shared_countries_df = query_to_df(
        self.connector,
        'select id_num, iso3 from dim_shared_country where id_num is not null',
        ['id_num', 'iso3'])
    id_num_iso3_map = dict(zip(shared_countries_df['id_num'], shared_countries_df['iso3']))
    clean_id_map = dict()
    for k, v in id_num_iso3_map.items():
        for _id in k.split('|'):
            try:
                clean_id_map[int(_id)] = v
            except ValueError:
                # Skip NaN's
                pass
    df['exporter'] = df['exporter'].replace(clean_id_map).astype(str)
    df['importer'] = df['importer'].replace(clean_id_map).astype(str)

    revision_name = 'hs{}'.format(params['hs_code'])
    df['hs_revision'] = REVISION_MAP[revision_name]
    df['hs_revision'] = df['hs_revision'].astype(int)
    df = df[['year', 'exporter', 'importer', 'hs_master_id', 'hs_revision',
             'hs_original_id', 'value', 'quantity']]
    df['version'] = datetime.now()
    df['version'] = pd.to_datetime(df['version'], infer_datetime_format=True)
    return df
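# A small, self-contained sketch of the id_num -> iso3 unpacking used above,
# with toy values (the real mapping comes from dim_shared_country, where one
# iso3 row can carry several numeric ids separated by '|'):
toy_map = {'124': 'can', '840|842': 'usa', 'nan': 'xxa'}
clean = {}
for k, v in toy_map.items():
    for _id in k.split('|'):
        try:
            clean[int(_id)] = v
        except ValueError:  # skip non-numeric entries such as 'nan'
            pass
print(clean)  # {124: 'can', 840: 'usa', 842: 'usa'}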
def run_step(self, prev, params):
    df = prev

    # deciles to single variable
    df = df.melt(id_vars=["income_source"], var_name="decile", value_name="value")
    df["income_source"] = df["income_source"].apply(lambda x: string.capwords(x))
    df["ent_id"] = df["income_source"].apply(lambda x: x if x in states else np.nan)
    df["ent_id"] = df["ent_id"].ffill()
    df = df[~df["income_source"].isin(states)]

    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    states_db = query_to_df(
        db_connector,
        raw_query='select ent_name, ent_id from dim_shared_geography_ent')
    states_db["ent_name"] = states_db["ent_name"].apply(lambda x: string.capwords(x))
    df.loc[df["ent_id"] == "México", "ent_id"] = "Estado De México"
    df["ent_id"].replace(dict(zip(states_db["ent_name"], states_db["ent_id"])), inplace=True)
    df["ent_id"] = df["ent_id"].replace(nat_replacement)
    df["income_source"] = df["income_source"].replace(income_source)
    df['year'] = params.get('year')
    df['nat_id'] = df['ent_id'].astype(str).apply(lambda x: 'mex' if x == '0' else '0')
    df.loc[df["value"].astype(str).str.startswith("0*"), "value"] = '0'

    for col in ["year", "income_source", "ent_id", "value"]:
        df[col] = df[col].astype(int)
    df = df[["nat_id", "ent_id", "year", "decile", "income_source", "value"]].copy()

    deflactores = {
        "2016": 1.1919724122410000,
        "2018": 1.0715630294111653,
        "2020": 1
    }
    # income values to 2020 currency (using ENIGH deflators)
    df["value"] = df["value"] * deflactores[str(params.get("year"))]
    df["value"] = df["value"].astype(int)
    return df
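# A quick check of the deflator arithmetic used by the ENIGH steps above,
# with an illustrative amount: 1,000 pesos reported in the 2016 survey are
# re-expressed in 2020 pesos before the final int cast.
deflactores = {"2016": 1.1919724122410000, "2018": 1.0715630294111653, "2020": 1}
print(int(1000 * deflactores["2016"]))  # 1191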
def run_step(self, prev, params):
    df = pd.read_csv(
        'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=6d352e35dcffafce',
        keep_default_na=False,
        na_values=[
            "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN",
            "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NULL", "NaN",
            "n/a", "nan", "null"
        ])

    # replace countries iso2 -> iso3
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    query = 'SELECT iso2, iso3 FROM dim_shared_country'
    db_countries = query_to_df(db_connector, raw_query=query)
    db_countries = dict(zip(db_countries['iso2'], db_countries['iso3']))
    df['country_region_code'] = df['country_region_code'].str.lower()
    df['country_region_code'].replace(db_countries, inplace=True)

    # filter columns
    cols = [
        x for x in df.columns
        if x not in ['country_region', 'sub_region_1', 'sub_region_2']
    ]
    df = df.loc[:, cols].copy()

    # format date
    df['date'] = df['date'].str.replace('-', '').astype(int)
    df.rename(columns={
        'country_region_code': 'iso3',
        'date': 'date_id'
    }, inplace=True)

    for col in [
            'retail_and_recreation_percent_change_from_baseline',
            'grocery_and_pharmacy_percent_change_from_baseline',
            'parks_percent_change_from_baseline',
            'transit_stations_percent_change_from_baseline',
            'workplaces_percent_change_from_baseline',
            'residential_percent_change_from_baseline'
    ]:
        df[col] = df[col].astype(float)

    df = df[[
        'iso3', 'date_id',
        'retail_and_recreation_percent_change_from_baseline',
        'grocery_and_pharmacy_percent_change_from_baseline',
        'parks_percent_change_from_baseline',
        'transit_stations_percent_change_from_baseline',
        'workplaces_percent_change_from_baseline',
        'residential_percent_change_from_baseline'
    ]].copy()
    return df
def run_step(self, prev, params):
    pk = params.get('pk_name')
    raw_query = 'SELECT {} FROM dim_shared_industry'.format(params.get('columns'))
    df = query_to_df(self.connector, raw_query=raw_query)
    df.drop_duplicates(subset=[pk], inplace=True)
    df[pk] = df[pk].astype(int)
    return df
def run_step(self, prev, params):
    dim_industry_query = ('SELECT sector_id, sector_es, sector_en, subsector_id, '
                          'subsector_es, subsector_en, industry_group_id, '
                          'industry_group_es, industry_group_en FROM dim_shared_industry_fdi')
    df = query_to_df(self.connector, raw_query=dim_industry_query)
    df.drop_duplicates(subset='industry_group_id', inplace=True)
    df[['subsector_id', 'industry_group_id']] = df[['subsector_id', 'industry_group_id']].astype(int)
    return df
def run_step(self, prev, params):
    df = pd.read_csv(prev)
    for locale in ['es', 'en']:
        for level in [
                'sector', 'subsector', 'industry_group', 'naics_industry',
                'national_industry'
        ]:
            df.sort_values(by=['{}_id'.format(level)], inplace=True)
            df['{}_{}_short'.format(level, locale)] = df['{}_{}_short'.format(level, locale)].ffill()
            df['{}_{}'.format(level, locale)] = df['{}_{}'.format(level, locale)].ffill()
            df.loc[df['{}_{}_short'.format(level, locale)].isna(), '{}_{}_short'.format(level, locale)] = \
                df.loc[df['{}_{}_short'.format(level, locale)].isna(), '{}_{}'.format(level, locale)]

    # code ids
    cols_es = list(df.columns[df.columns.str.contains('_es')])
    cols_en = list(df.columns[df.columns.str.contains('_en')])
    nltk.download('stopwords')
    stopwords_es = nltk.corpus.stopwords.words('spanish')
    df = format_text(df, cols_es, stopwords=stopwords_es)
    df = format_text(df, cols_en, stopwords=stop_words.ENGLISH_STOP_WORDS)

    for col in [
            'sector_id', 'subsector_id', 'industry_group_id',
            'naics_industry_id', 'national_industry_id'
    ]:
        df[col] = df[col].astype(str)

    # when creating the industry dimension, the CMS asks for members, so 'ghost'
    # profiles are created; they also appear in the search bar, which
    # canon-cms-warmup picks up as well
    query = 'SELECT distinct(national_industry_id) FROM inegi_economic_census'
    query_result = query_to_df(self.connector, raw_query=query)
    query_result = list(query_result['national_industry_id'])
    print('Total ids (dimension):', df.shape[0])
    print('Ids in dimension but not in data:',
          df.loc[~df['national_industry_id'].isin(query_result)].shape[0])
    print('Total ids (data):',
          df.loc[df['national_industry_id'].isin(query_result)].shape[0])
    df = df.loc[df['national_industry_id'].isin(query_result)].copy()

    if params.get('is_dim'):
        df.drop_duplicates(subset=['national_industry_es'], inplace=True)
    return df
def run_step(self, prev_result, params):
    logger.info('Running Xform step...')
    df = prev_result
    df['month_id'] = df.Periodo.str.replace('/', '')
    df['trade_flow_id'] = df.name.apply(lambda x: 1 if 'Importaciones' in x.split() else 2)
    df['petroleum'] = df.name.apply(lambda x: 1 if 'Petroleras' in x.split() else 2)
    df['country_name'] = df.name.apply(lambda x: x.split(' > ')[-1])

    dim_country = query_to_df(self.connector, 'select * from dim_shared_country')
    dim_country = dim_country[['country_name_es', 'iso3']]
    iso3_dict_items_list = list(iso3_dict.items())
    df_temp = pd.DataFrame(iso3_dict_items_list, columns=['country_name_es', 'iso3'])
    dim_country = pd.concat([dim_country, df_temp])

    # `pattern` avoids shadowing the stdlib `string` module used elsewhere
    for pattern in strings_to_remove:
        df.country_name = df.country_name.str.replace(pattern, '', 1)
    df = df[(~df.country_name.str.contains('total')) & ~df.country_name.str.contains('Total')]

    df = df.merge(
        dim_country[['iso3', 'country_name_es']].drop_duplicates(subset=['country_name_es']),
        left_on='country_name', right_on='country_name_es', how='left')
    df = df[['month_id', 'iso3', 'trade_flow_id', 'petroleum', 'value']]
    df = df[df.value != 'C']
    df.value = df.value.astype(float)
    df.month_id = df.month_id.astype(int)
    df.value = df.value.fillna(0)
    df.value = df.value * 1000
    return df
def check_update(files, table):
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    current_files = []
    for level in ['ent', 'mun']:
        try:
            query = 'SELECT distinct(url) FROM {}{}'.format(table, level)
            temp = list(query_to_df(db_connector, raw_query=query)['url'])
        except Exception:
            temp = []
        current_files = current_files + temp
    if len(current_files) > 0:
        df = pd.DataFrame({'url': files})
        return list(df.loc[~df['url'].isin(current_files), 'url'])
    else:
        return files
def run_step(self, prev, params):
    if int(params.get("year")) < 2020:
        df = pd.read_excel(prev, header=4)
        df.rename(columns={'Unnamed: 0': 'ent_id'}, inplace=True)
    else:
        df = pd.read_excel(prev, header=3)
        df.rename(columns={'Entidad federativa': 'ent_id'}, inplace=True)
    df.dropna(inplace=True)

    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    states = query_to_df(
        db_connector,
        raw_query='select ent_name, ent_id from dim_shared_geography_ent')
    df['ent_id'].replace(dict(zip(states['ent_name'], states['ent_id'])), inplace=True)

    df = df.melt(id_vars='ent_id', var_name='decile')
    df.replace({'Estados Unidos Mexicanos': 33}, inplace=True)
    df.replace({'México': 15}, inplace=True)
    df['value'] = df['value'].astype(int)
    df['year'] = params.get('year')
    df['ent_id'] = df['ent_id'].replace(33, 0)
    df['nat_id'] = df['ent_id'].astype(str).apply(lambda x: 'mex' if x == '0' else '0')
    df = df[['nat_id', 'ent_id', 'year', 'decile', 'value']].copy()

    deflactores = {
        "2016": 1.1919724122410000,
        "2018": 1.0715630294111653,
        "2020": 1
    }
    # income values to 2020 currency (using ENIGH deflators)
    df["value"] = df["value"] * deflactores[str(params.get("year"))]
    df["value"] = df["value"].astype(int)
    return df
def run_step(self, prev, params):
    df = prev
    replace_dict = {
        'País de origen de los ingresos por remesas, ': '',
        'País de destino de los egresos por remesas, ': ''
    }
    for key, val in replace_dict.items():
        df['iso3'] = df['iso3'].apply(lambda x: x.replace(key, val))
    df = df[~df['iso3'].str.contains('Total')]

    # time re ordering
    df['quarter_id'] = df['quarter_id'].str[-7:].str.replace('/', '')
    df['quarter_id'] = df['quarter_id'].str[-4:] + df['quarter_id'].str[:2]
    df['move'] = df['move'].replace(move_dict)

    query = 'SELECT country_name_es, iso3 FROM dim_shared_country'
    geo_country = query_to_df(self.connector, raw_query=query)

    # replacing country names with ids
    country_dict = dict(zip(geo_country['country_name_es'], geo_country['iso3']))
    df['iso3'] = df['iso3'].replace(country_dict).replace(wrong_countries)

    # quarters fix
    df['quarter_id'] = df['quarter_id'].str[0:4] + df['quarter_id'].str[4:].replace(trimester_map)

    for col in ['quarter_id', 'move']:
        df[col] = df[col].astype(int)

    # Amounts from Millions of USD to USD
    df['remittance_amount'] = df['remittance_amount'] * 1000000
    df['remittance_amount'] = df['remittance_amount'].apply(lambda x: round(x, 2))
    return df
def check_update(files, table):
    """Check all the urls (file paths in GCP) already recorded in ClickHouse
    and compare them against the files currently in GCP. Returns the list of
    new urls (new data), or all the urls when nothing has been ingested into
    ClickHouse yet."""
    db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
    current_files = []
    for level in ['ent', 'mun']:
        try:
            query = 'SELECT distinct(url) FROM {}{}'.format(table, level)
            temp = list(query_to_df(db_connector, raw_query=query)['url'])
        except Exception:
            temp = []
        current_files = current_files + temp
    if len(current_files) > 0:
        df = pd.DataFrame({'url': files})
        return list(df.loc[~df['url'].isin(current_files), 'url'])
    else:
        return files
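# A minimal driver sketch for check_update(); the bucket paths, the
# 'inegi_denue_' table prefix and SomePipeline are hypothetical examples,
# not names taken from these pipelines (check_update itself appends the
# 'ent'/'mun' level to the prefix when it queries ClickHouse).
files = ['gs://bucket/denue_2021.csv', 'gs://bucket/denue_2022.csv']
for url in check_update(files, 'inegi_denue_'):
    pp = SomePipeline()  # hypothetical pipeline class
    pp.run({'url': url})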
def run_step(self, prev, params):
    df = prev
    df = df.melt(id_vars=['título', 'trade_flow_id'],
                 var_name='month_id', value_name='trade_value')
    df = df.rename(columns={'título': 'iso3'})

    # month re format
    df['month_id'] = df['month_id'].str.replace('/', '').str[2:]
    df['month_id'] = df['month_id'].str[2:] + df['month_id'].str[:2]

    # replacing country names with ids
    df['iso3'] = df['iso3'].str.split(',').str[-1].str[1:]
    df = df[~df['iso3'].str.contains('Total')]
    query = 'SELECT country_name_es, iso3, continent_es FROM dim_shared_country'
    geo_country = query_to_df(self.connector, raw_query=query)
    continent_list = list(geo_country['continent_es'])
    country_dict = dict(zip(geo_country['country_name_es'], geo_country['iso3']))
    df = df[~df['iso3'].isin(continent_list + not_countries)]
    df['iso3'] = df['iso3'].replace(country_dict).replace(wrong_countries)

    # thousand usd to usd
    df['trade_value'] = df['trade_value'] * 1000
    for col in ['trade_flow_id', 'month_id', 'trade_value']:
        df[col] = df[col].astype(int)
    df = df.groupby(['iso3', 'trade_flow_id', 'month_id']).sum().reset_index()
    return df
def run_step(self, prev, params):
    df, raw_to_careers, careers_to_id, raw_to_institution, institution_to_id = prev

    # type format
    for col in ['ent_id', 'mun_id', 'career', 'type', 'campus_id', 'institution_id', 'program']:
        df[col] = df[col].ffill()
    df.ent_id = df.ent_id.str.title()

    # ids replace from external table
    df.ent_id.replace(ORIGIN, inplace=True)

    # totals clean
    df.career = df.career.astype('str')
    for col in ['mun_id', 'career', 'type']:
        print('Current col:', col)
        df[col] = df[col].astype(str)
        df = df.loc[df[col].str.contains('Total') == False].copy()
        df[col] = df[col].str.strip().str.replace('  ', ' ').str.replace(':', '')
    df.career = df.career.str.replace('.', '').astype('int')
    df['program'] = df['program'].astype(str).str.strip().str.upper()

    # municipality level id
    try:
        df.loc[:, 'mun_id'] = df.loc[:, 'ent_id'].astype(float).astype(int).astype(str) + \
            df.loc[:, 'mun_id'].astype(float).astype(int).astype(str).str.zfill(3)
    except Exception as e:
        print('Municipality level id:', e)

    query = 'SELECT mun_id, mun_name FROM dim_shared_geography_mun'
    geo_mun = query_to_df(self.connector, raw_query=query).drop_duplicates(subset=['mun_id'])
    geo_mun['mun_name'] = geo_mun['mun_name'].str.upper()
    df['mun_id'].replace(dict(zip(geo_mun['mun_name'], geo_mun['mun_id'])), inplace=True)
    df.drop(columns=['ent_id'], inplace=True)

    # column names format
    df.columns = df.columns.str.replace('suma de ', '').str.replace('pni-', '')

    # 2020 rename measures
    if params.get('custom'):
        df.columns = [
            '{}-{}-{}'.format('mat', x.split('-')[2], x.split('-')[1])
            if len(x.split('-')) == 3 else x for x in df.columns
        ]
        # exceptions:
        age_range_replace = {
            'm-30-34-h': 'mat-h-30-34',
            'm-30-34-m': 'mat-m-30-34',
            'm-35-39-h': 'mat-h-35-39',
            'm-35-39-m': 'mat-m-35-39'
        }
        df.rename(columns=age_range_replace, inplace=True)

    # melt step
    age_range = [x for x in df.columns if 'mat-' in x]
    df = df[['mun_id', 'career', 'type', 'campus_id', 'program', 'institution_id'] + age_range].copy()
    df.columns = df.columns.str.replace('mat-', '')
    df = df.melt(id_vars=['mun_id', 'career', 'type', 'campus_id', 'program', 'institution_id'],
                 var_name='sex', value_name='value')
    df = df.loc[df.value != 0]
    split = df['sex'].str.split('-', n=1, expand=True)
    df['sex'] = split[0]
    df['age'] = split[1]
    df.sex.replace(SEX, inplace=True)
    df.type = df.type.replace(TYPES).replace(type_fix)
    df.age.replace(AGE_RANGE, inplace=True)

    # stopwords es
    nltk.download('stopwords')

    # career_id replace
    # Raw name -> processed name
    careers = pd.read_csv(raw_to_careers)
    if params.get('year_plus') == 2021:
        careers['backup'] = careers['backup'].str.upper().apply(lambda x: unidecode.unidecode(x))
    df.loc[df['program'].isin(['#NAME?', '#¿NOMBRE?']), 'program'] = '0'
    df = df.loc[~df['program'].str.upper().str.contains('TOTAL')].copy()
    missing_ids = df.loc[(~df['program'].isin(list(careers['backup']) + ['0']))].shape[0]
    print('Missing ids (careers):', missing_ids)
    if missing_ids > 0:
        print(df.loc[~df['program'].isin(list(careers['backup']))])
        df.loc[~df['program'].isin(list(careers['backup']))].to_csv('review_career.csv', index=False)
    df.program = df.program.map(dict(zip(careers.backup, careers.name_es)))

    # Processed name -> numerical id
    careers = pd.read_csv(careers_to_id)
    df.program = df.program.map(dict(zip(careers.backup, careers.career_id)))
    df.program = df.program.fillna(0)

    # campus_id replace
    for col in ['institution_id', 'campus_id']:
        df[col] = df[col].astype(str).str.strip()
    df['backup'] = (df['institution_id'] + df['campus_id']).str.strip()

    # Raw name -> processed name
    raw_institution = pd.read_csv(raw_to_institution)
    # normalization
    df['backup'] = df['backup'].str.upper()
    raw_institution['backup'] = raw_institution['backup'].str.upper()
    df = df.loc[~df['backup'].str.upper().str.contains('TOTAL')].copy()
    missing_ids = df.loc[~df['backup'].isin(list(raw_institution['backup']))]['backup'].unique()
    print('Missing ids (institutions):', len(missing_ids))
    if len(missing_ids) > 0:
        print(df.loc[~df['backup'].isin(list(raw_institution['backup']))])
        df.loc[~df['backup'].isin(list(raw_institution['backup']))].to_csv('review_campus.csv', index=False)
    df.backup = df.backup.replace(dict(zip(raw_institution.backup, raw_institution.backup_raw)))

    institutions = pd.read_csv(institution_to_id)
    df['backup'] = df['backup'].str.upper()
    institutions['backup_raw'] = institutions['backup_raw'].str.upper()
    df.backup = df.backup.replace(dict(zip(institutions.backup_raw, institutions.campus_id)))

    # Processed name -> numerical id
    df['campus_id'] = df['backup'].copy()

    # replacing missing muns
    df['mun_id'] = df['mun_id'].replace(mun_fix)
    df['value'] = df['value'].replace('--', 0).fillna(0)
    for col in ['mun_id', 'career', 'program', 'type', 'sex', 'value', 'age', 'campus_id']:
        df[col] = df[col].astype('int')
    df = df[['mun_id', 'campus_id', 'program', 'type', 'sex', 'value', 'age']].copy()
    df['year'] = int(params.get('year_plus'))
    df.dropna(subset=['value'], inplace=True)
    return df
def run_step(self, prev, params):
    gdp_nat, gdp_ent = prev[0], prev[1]

    # nation trimester gdp cleaning
    gdp_nation = pd.read_csv(gdp_nat)
    gdp_nation.columns = gdp_nation.columns.str.strip().str.lower().str.replace(' ', '_')
    gdp_nation = gdp_nation.melt(id_vars=['concepto'], var_name='year', value_name='gdp_nat')
    gdp_nation = gdp_nation[
        gdp_nation['concepto'] == 'a B.1bV - Valor agregado bruto'
    ].reset_index(drop=True)[['year', 'gdp_nat']].copy()
    gdp_nation['year'] = gdp_nation['year'].apply(
        lambda x: x.replace('undefined_', '').replace('_|_', '').replace('_p', '').replace('_r', ''))
    gdp_nation = gdp_nation[gdp_nation['year'].str[-5:] == 'anual']

    # keep only the year portion of the period label
    gdp_nation['year'] = gdp_nation['year'].str[:4]
    gdp_nation = gdp_nation.groupby(['year']).sum().reset_index()

    # state trimester gdp cleaning
    gdp_ent = pd.read_csv(gdp_ent)
    gdp_ent.columns = gdp_ent.columns.str.strip().str.lower().str.replace(' ', '_')
    gdp_ent = gdp_ent.iloc[6:-1].iloc[186:218, :].reset_index(drop=True)
    gdp_ent = gdp_ent.melt(id_vars='concepto', var_name='year',
                           value_name='gdp_ent').rename(columns={'concepto': 'ent_id'})
    gdp_ent['year'] = gdp_ent['year'].str[:4]

    # querying the database to get entity ids
    query = 'SELECT ent_id, ent_name FROM dim_shared_geography_ent'
    geo_ent = query_to_df(self.connector, raw_query=query).drop_duplicates('ent_id')
    ent_dict = dict(zip(geo_ent['ent_name'], geo_ent['ent_id']))
    gdp_ent['ent_id'] = gdp_ent['ent_id'].replace('México', 'Estado de México').replace(ent_dict)

    # filling missing years in gdp_ent df
    for year in ['2020', '2021']:
        df_temp = pd.DataFrame(data={
            'ent_id': gdp_ent.ent_id.unique(),
            'year': year,
            'gdp_ent': np.nan
        })
        gdp_ent = pd.concat([gdp_ent, df_temp], ignore_index=True).reset_index(drop=True)

    # cleaning (we won't use gdp_nation)
    gdp_nation = gdp_nation[gdp_nation['year'].astype(int) >= 2003].reset_index(drop=True).copy()
    gdp_ent = gdp_ent[~gdp_ent['gdp_ent'].isna()]
    gdp_ent['year'] = gdp_ent['year'].astype(int)
    gdp_ent['gdp_ent'] = gdp_ent['gdp_ent'].apply(lambda x: round(x, 2))
    return gdp_ent
# drop table
clean_tables('gobmx_covid')

# ingest data
pp = CovidPipeline()
data = sorted(glob.glob('*.csv'))
for chunk in pd.read_csv(data[-1], encoding='latin-1', iterator=True, chunksize=10**5):
    pp.run({'chunk': chunk})

# update time dimension
db_connector = Connector.fetch('clickhouse-database', open('../conns.yaml'))
min_query = 'SELECT min(ingress_date), min(symptoms_date), min(updated_date) FROM gobmx_covid'
max_query = 'SELECT max(ingress_date), max(symptoms_date), max(updated_date) FROM gobmx_covid'
min_value = str(min(query_to_df(db_connector, raw_query=min_query).iloc[0].to_list()))
max_value = str(max(query_to_df(db_connector, raw_query=max_query).iloc[0].to_list()))
min_time = '{}-{}-{}'.format(min_value[:4], min_value[4:6], min_value[6:])
max_time = '{}-{}-{}'.format(max_value[:4], max_value[4:6], max_value[6:])

pp = DimTimeDatePipeline()
pp.run({'init': min_time, 'end': max_time})

# covid stats dim time table
pp = DimTimeDateCovidStatsPipeline()
pp.run({'init': min_time, 'end': max_time})

print('Duration: {}'.format(datetime.now() - start_time))
def run_step(self, prev, params):
    df, raw_to_careers, careers_to_id, raw_to_institution, institution_to_id = prev

    # type format
    for col in ['ent_id', 'mun_id', 'career', 'type', 'campus_id', 'institution_id', 'program']:
        df[col] = df[col].ffill()
    df.ent_id = df.ent_id.str.title()

    # ids replace from external table
    df.ent_id.replace(ORIGIN, inplace=True)

    # totals clean
    df.career = df.career.astype('str')
    for col in ['mun_id', 'career', 'type']:
        print('Current col:', col)
        df[col] = df[col].astype(str)
        df = df.loc[df[col].str.contains('Total') == False].copy()
        df[col] = df[col].str.strip().str.replace('  ', ' ').str.replace(':', '')
    df.career = df.career.str.replace('.', '').astype('int')
    df['program'] = df['program'].astype(str).str.strip().str.upper()

    # municipality level id
    try:
        df.loc[:, 'mun_id'] = df.loc[:, 'ent_id'].astype(float).astype(int).astype(str) + \
            df.loc[:, 'mun_id'].astype(float).astype(int).astype(str).str.zfill(3)
    except Exception as e:
        print('Municipality level id:', e)

    query = 'SELECT mun_id, mun_name FROM dim_shared_geography_mun'
    geo_mun = query_to_df(self.connector, raw_query=query).drop_duplicates(subset=['mun_id'])
    geo_mun['mun_name'] = geo_mun['mun_name'].str.upper()
    df['mun_id'].replace(dict(zip(geo_mun['mun_name'], geo_mun['mun_id'])), inplace=True)
    df.drop(columns=['ent_id'], inplace=True)

    # column names format
    df.columns = df.columns.str.replace('suma de ', '')

    # melt step
    df = df[['mun_id', 'career', 'type', 'campus_id', 'program', 'institution_id',
             'pni-agu', 'pni-bc', 'pni-bcs', 'pni-cam', 'pni-coa', 'pni-col',
             'pni-chia', 'pni-chih', 'pni-df', 'pni-dur', 'pni-gua', 'pni-gue',
             'pni-hid', 'pni-jal', 'pni-mex', 'pni-mic', 'pni-mor', 'pni-nay',
             'pni-nl', 'pni-oax', 'pni-pue', 'pni-que', 'pni-qr', 'pni-slp',
             'pni-sin', 'pni-son', 'pni-tab', 'pni-tam', 'pni-tla', 'pni-ver',
             'pni-yuc', 'pni-zac']].copy()

    # column names format
    df.columns = df.columns.str.replace('pni-', '')
    df = df.melt(id_vars=['mun_id', 'career', 'type', 'campus_id', 'program', 'institution_id'],
                 var_name='origin', value_name='value')
    df = df.loc[df.value != 0]

    # external ids transformation
    df['origin'].replace(ENT_ORIGIN, inplace=True)

    # encoding
    df.type.replace(TYPES, inplace=True)

    # stopwords es
    nltk.download('stopwords')

    # career_id replace
    # Raw name -> processed name
    careers = pd.read_csv(raw_to_careers)
    df.loc[df['program'].isin(['#NAME?', '#¿NOMBRE?']), 'program'] = '0'
    df = df.loc[~df['program'].str.upper().str.contains('TOTAL')].copy()
    missing_ids = df.loc[(~df['program'].isin(list(careers['backup']) + ['0']))].shape[0]
    print('Missing ids (careers):', missing_ids)
    if missing_ids > 0:
        print(df.loc[~df['program'].isin(list(careers['backup']))])
        df.loc[~df['program'].isin(list(careers['backup']))].to_csv('review_career.csv', index=False)
    df.program = df.program.replace(dict(zip(careers.backup, careers.name_es)))

    # Processed name -> numerical id
    careers = pd.read_csv(careers_to_id)
    df.program = df.program.replace(dict(zip(careers.backup, careers.career_id)))

    # campus_id replace
    for col in ['institution_id', 'campus_id']:
        df[col] = df[col].astype(str).str.strip()
    df['backup'] = (df['institution_id'] + df['campus_id']).str.strip()

    # Raw name -> processed name
    institution = pd.read_csv(raw_to_institution)
    df = df.loc[~df['backup'].str.upper().str.contains('TOTAL')].copy()
    missing_ids = df.loc[~df['backup'].isin(list(institution['backup']))].shape[0]
    print('Missing ids (institutions):', missing_ids)
    if missing_ids > 0:
        print(df.loc[~df['backup'].isin(list(institution['backup']))])
        df.loc[~df['backup'].isin(list(institution['backup']))].to_csv('review_campus.csv', index=False)
    df.backup = df.backup.replace(dict(zip(institution.backup, institution.backup_raw)))

    # Processed name -> numerical id
    institution = pd.read_csv(institution_to_id)
    df.backup = df.backup.replace(dict(zip(institution.backup_raw, institution.campus_id)))
    df['campus_id'] = df['backup']

    for col in ['mun_id', 'program', 'type', 'origin', 'value', 'campus_id']:
        print('Current col:', col)
        df[col] = df[col].astype('float')
    df = df[['mun_id', 'campus_id', 'program', 'type', 'origin', 'value']].copy()
    df['year'] = int(params.get('year_plus'))
    return df
def run_step(self, prev, params):
    BASE_URL = os.environ.get('BASE_URL')

    # Data from each report
    def _report(url):
        headers = {
            "Cache-Control": "no-cache",
            "Pragma": "no-cache"
        }
        r = requests.get(url, headers=headers)
        data_json = r.json()
        report = pd.DataFrame(data_json["data"])
        report = report[report["Municipality"] != "No Informado"]
        report = report.drop(columns={"Updated Date", "Municipality"})
        report = report.rename(columns={"Updated Date ID": "time_id",
                                        "Municipality ID": "mun_id"})
        df_temp = []
        for a, df_a in report.groupby("mun_id"):
            _df = df_a.copy()
            _df = _df.sort_values("time_id")
            _df["reported_cases"] = _df["Cases"].diff().fillna(0).astype(int)
            df_temp.append(_df)
        df_temp = pd.concat(df_temp, sort=False)
        return df_temp

    report_cases = _report(BASE_URL + "/data.jsonrecords?Covid+Result=1&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
    report_cases = report_cases.rename(columns={"reported_cases": "new_cases_report",
                                                "Cases": "accum_cases_report"})
    report_deaths = _report(BASE_URL + "/data.jsonrecords?Covid+Result=1&Is+Dead=1&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
    report_deaths = report_deaths.rename(columns={"reported_cases": "new_deaths_report",
                                                  "Cases": "accum_deaths_report"})
    report_hospitalized = _report(BASE_URL + "/data.jsonrecords?Covid+Result=1&Patient+Type=2&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
    report_hospitalized = report_hospitalized.rename(columns={"reported_cases": "new_hospitalized_report",
                                                              "Cases": "accum_hospitalized_report"})
    report_suspect = _report(BASE_URL + "/data.jsonrecords?Covid+Result=3&cube=gobmx_covid&drilldowns=Updated+Date%2CMunicipality&measures=Cases&parents=false&sparse=false")
    report_suspect = report_suspect.rename(columns={"reported_cases": "new_suspect_report",
                                                    "Cases": "accum_suspect_report"})

    report = pd.merge(report_cases, report_deaths, how="outer", on=["time_id", "mun_id"])
    report = pd.merge(report, report_hospitalized, how="outer", on=["time_id", "mun_id"])
    report = pd.merge(report, report_suspect, how="outer", on=["time_id", "mun_id"])

    # Create dictionary for state and population
    r = requests.get(BASE_URL + "/data?Year=2020&cube=population_projection&drilldowns=Municipality&measures=Projected+Population&parents=false&sparse=false")
    data_json = r.json()
    states_data = pd.DataFrame(data_json["data"])
    dicto_mun_population = dict(zip(states_data["Municipality ID"],
                                    states_data["Projected Population"]))

    # Data from latest report
    data = sorted(glob.glob("*.csv"))
    df = pd.read_csv(data[-1], encoding="latin-1")
    df.columns = [x.strip().lower().replace(" ", "_") for x in df.columns]
    df["entidad_res"] = df["entidad_res"].astype(str).str.zfill(2)
    df["municipio_res"] = df["municipio_res"].astype(str).str.zfill(3)
    df["mun_id"] = (df["entidad_res"] + df["municipio_res"]).astype(int)

    # ids refactor
    df.loc[df['clasificacion_final'].isin([1, 2, 3]), 'resultado_lab'] = 1
    df.loc[df['clasificacion_final'].isin([4, 5, 6]), 'resultado_lab'] = 3
    df.loc[df['clasificacion_final'] == 7, 'resultado_lab'] = 2

    # Suspect cases
    df_suspect = df[["mun_id", "fecha_ingreso", "resultado_lab"]]
    df_suspect = df_suspect[df_suspect["resultado_lab"] == 3]
    df_suspect["resultado_lab"] = df_suspect["resultado_lab"].replace(3, 1)
    df_suspect = df_suspect.rename(columns={"fecha_ingreso": "time_id",
                                            "resultado_lab": "daily_suspect"})
    df_suspect = df_suspect.groupby(["mun_id", "time_id"]).sum().reset_index()

    # Hospitalized
    df_hosp = df[["mun_id", "fecha_ingreso", "resultado_lab", "tipo_paciente"]]
    df_hosp = df_hosp[(df_hosp["resultado_lab"] == 1) & (df_hosp["tipo_paciente"] == 2)]
    df_hosp = df_hosp.drop(columns="tipo_paciente")
    df_hosp = df_hosp.rename(columns={"fecha_ingreso": "time_id",
                                      "resultado_lab": "daily_hospitalized"})
    df_hosp = df_hosp.groupby(["mun_id", "time_id"]).sum().reset_index()

    # Cases
    df1 = df[["mun_id", "fecha_ingreso", "resultado_lab"]]
    df1 = df1[df1["resultado_lab"] == 1]
    df1 = df1.rename(columns={"fecha_ingreso": "time_id", "resultado_lab": "daily_cases"})
    df1 = df1.groupby(["mun_id", "time_id"]).sum().reset_index()
    df1 = pd.merge(df1, df_hosp, how="outer", on=["mun_id", "time_id"])
    df1 = pd.merge(df1, df_suspect, how="outer", on=["mun_id", "time_id"])
    df1["time_id"] = pd.to_datetime(df1["time_id"])

    # Replace unknown municipalities
    mun = query_to_df(self.connector, "select mun_id from dim_shared_geography_mun")
    df1.loc[~df1["mun_id"].isin(mun['mun_id']), "mun_id"] = 33000
    df1 = df1.groupby(["mun_id", "time_id"]).sum().reset_index()
    df1["time_id"] = pd.to_datetime(df1["time_id"])

    # Add missing dates daily cases
    df1_ = []
    last_day = df1["time_id"].max()
    for a, df_a in df1.groupby("mun_id"):
        first_day = df_a["time_id"].min()
        idx = pd.date_range(first_day, last_day)
        df_b = df_a.reindex(idx)
        df_b = pd.DataFrame(df_b.index)
        df_b = df_b.rename(columns={0: "time_id"})
        result = pd.merge(df_b, df_a, how="outer", on="time_id")
        df1_.append(result)
    df1_ = pd.concat(df1_, sort=False)
    df1_["mun_id"] = df1_["mun_id"].fillna(method="ffill")

    # Deaths
    # fix max death date
    max_current_date = df['fecha_ingreso'].max()
    df.loc[df['fecha_def'] > max_current_date, 'fecha_def'] = "9999-99-99"
    df2 = df[["mun_id", "fecha_ingreso", "fecha_def", "resultado_lab"]]
    df2 = df2.rename(columns={"fecha_ingreso": "ingress_date",
                              "fecha_def": "death_date",
                              "resultado_lab": "daily_deaths"})
    df2 = df2[df2["daily_deaths"] == 1]
    df2 = df2[df2["death_date"] != "9999-99-99"]
    for i in ["ingress_date", "death_date"]:
        df2[i] = pd.to_datetime(df2[i])
    df2["days_between_ingress_and_death"] = df2["death_date"] - df2["ingress_date"]
    df2["days_between_ingress_and_death"] = df2["days_between_ingress_and_death"].dt.days
    df2 = df2.drop(columns="ingress_date")
    df2 = df2.rename(columns={"death_date": "time_id"})
    df2 = df2.groupby(["mun_id", "time_id"]).agg({
        "daily_deaths": "sum",
        "days_between_ingress_and_death": "mean"
    }).reset_index()

    # Add missing dates deaths
    df2_ = []
    last_day = df2["time_id"].max()
    for a, df_a in df2.groupby("mun_id"):
        first_day = df_a["time_id"].min()
        idx = pd.date_range(first_day, last_day)
        df_b = df_a.reindex(idx)
        df_b = pd.DataFrame(df_b.index)
        df_b = df_b.rename(columns={0: "time_id"})
        result = pd.merge(df_b, df_a, how="outer", on="time_id")
        df2_.append(result)
    df2_ = pd.concat(df2_, sort=False)
    df2_["mun_id"] = df2_["mun_id"].fillna(method="ffill")

    # Merge daily cases and deaths
    data = pd.merge(df1_, df2_, how="outer", on=["time_id", "mun_id"])
    data = data.sort_values(by=["time_id", "mun_id"])
    data["time_id"] = data["time_id"].astype(str).str.replace("-", "").astype(int)

    # Merge data latest report with data from each report
    data = pd.merge(data, report, how="outer", on=["time_id", "mun_id"])
    for col in ["daily_cases", "daily_hospitalized", "daily_suspect", "daily_deaths",
                "accum_cases_report", "new_cases_report", "accum_deaths_report",
                "new_deaths_report", "accum_hospitalized_report", "new_hospitalized_report",
                "accum_suspect_report", "new_suspect_report"]:
        data[col] = data[col].fillna(0).astype(int)

    # Add column of accumulated cases
    df_final = []
    for a, df_a in data.groupby("mun_id"):
        # create temporal df to silence "SettingWithCopyWarning"
        _df = df_a.copy()
        _df["accum_cases"] = _df["daily_cases"].cumsum()
        _df["accum_deaths"] = _df["daily_deaths"].cumsum()
        _df["accum_hospitalized"] = _df.daily_hospitalized.cumsum()
        _df["accum_suspect"] = _df.daily_suspect.cumsum()
        for i in ["daily_cases", "accum_cases", "daily_deaths", "accum_deaths",
                  "accum_cases_report", "new_cases_report",
                  "accum_deaths_report", "new_deaths_report"]:
            measure = "avg7_{}".format(i)
            _df[measure] = _df[i].rolling(7).mean()
        for j in ["daily_cases", "accum_cases", "daily_deaths", "accum_deaths",
                  "accum_cases_report", "new_cases_report",
                  "accum_deaths_report", "new_deaths_report"]:
            measure = "sum_last7_{}".format(j)
            _df[measure] = _df[j].rolling(7).sum()
        df_final.append(_df)
    df_final = pd.concat(df_final, sort=False)
    for k in ["sum_last7_daily_cases", "sum_last7_accum_cases",
              "sum_last7_daily_deaths", "sum_last7_accum_deaths",
              "sum_last7_accum_cases_report", "sum_last7_new_cases_report",
              "sum_last7_accum_deaths_report", "sum_last7_new_deaths_report"]:
        df_final[k] = df_final[k].fillna(0).astype(int)

    # Rate per 100,000 inhabitants
    df_final["population_temp"] = df_final["mun_id"].replace(dicto_mun_population)
    df_final["population"] = np.where(df_final["population_temp"] != df_final["mun_id"],
                                      df_final["population_temp"], np.nan)
    df_final["rate_daily_cases"] = (df_final["daily_cases"] / df_final["population"]) * 100000
    df_final["rate_accum_cases"] = (df_final["accum_cases"] / df_final["population"]) * 100000
    df_final["rate_new_cases_report"] = (df_final["new_cases_report"] / df_final["population"]) * 100000
    df_final["rate_accum_cases_report"] = (df_final["accum_cases_report"] / df_final["population"]) * 100000
    df_final["rate_daily_deaths"] = (df_final["daily_deaths"] / df_final["population"]) * 100000
    df_final["rate_accum_deaths"] = (df_final["accum_deaths"] / df_final["population"]) * 100000
    # uses new_deaths_report (the deaths counterpart of rate_new_cases_report)
    df_final["rate_new_deaths_report"] = (df_final["new_deaths_report"] / df_final["population"]) * 100000
    df_final["rate_accum_deaths_report"] = (df_final["accum_deaths_report"] / df_final["population"]) * 100000
    df_final = df_final.drop(columns={"population", "population_temp"})

    # Add a column with days, being day one when at least 50 cases accumulate
    day = []
    for a, df_a in df_final.groupby("mun_id"):
        default = 0
        for row in df_a.iterrows():
            if row[1]["accum_cases"] >= 50:
                default = default + 1
            day.append(default)
    df_final["day_from_50_cases"] = day

    # Add a column with days, being day one when at least 10 deaths accumulate
    day = []
    for a, df_a in df_final.groupby("mun_id"):
        default = 0
        for row in df_a.iterrows():
            if row[1]["accum_deaths"] >= 10:
                default = default + 1
            day.append(default)
    df_final["day_from_10_deaths"] = day

    df_final["mun_id"] = df_final["mun_id"].astype(int)
    print(datetime.now(), df_final["time_id"].max())
    return df_final
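# The 7-day measures above use the standard pandas rolling-window pattern;
# a toy sketch of what rolling(7) produces (NaN until a full window exists,
# which is why the sum_last7_* columns are filled with 0 afterwards):
import pandas as pd
s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8])
print(s.rolling(7).mean())  # NaN x6, then 4.0, 5.0
print(s.rolling(7).sum())   # NaN x6, then 28.0, 35.0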
def run_step(self, prev, params):
    df, raw_to_careers, careers_to_id, raw_to_institution, institution_to_id = prev

    # type format
    for col in ['ent_id', 'mun_id', 'career', 'type', 'campus_id', 'institution_id', 'program']:
        df[col] = df[col].ffill()
    df.ent_id = df.ent_id.str.title()

    # ids replace from external table
    df.ent_id.replace(ORIGIN, inplace=True)

    # totals clean
    df.career = df.career.astype('str')
    for col in ['mun_id', 'career', 'type']:
        print('Current col:', col)
        df[col] = df[col].astype(str)
        df = df.loc[df[col].str.contains('Total') == False].copy()
        df[col] = df[col].str.strip().str.replace('  ', ' ').str.replace(':', '')
    df.career = df.career.str.replace('.', '').astype('int')
    df['program'] = df['program'].astype(str).str.strip().str.upper()

    # municipality level id
    try:
        df.loc[:, 'mun_id'] = df.loc[:, 'ent_id'].astype(float).astype(int).astype(str) + \
            df.loc[:, 'mun_id'].astype(float).astype(int).astype(str).str.zfill(3)
    except Exception as e:
        print('Municipality level id:', e)

    query = 'SELECT mun_id, mun_name FROM dim_shared_geography_mun'
    geo_mun = query_to_df(self.connector, raw_query=query).drop_duplicates(subset=['mun_id'])
    geo_mun['mun_name'] = geo_mun['mun_name'].str.upper()
    df['mun_id'].replace(dict(zip(geo_mun['mun_name'], geo_mun['mun_id'])), inplace=True)
    df.drop(columns=['ent_id'], inplace=True)

    # column names format
    df.columns = df.columns.str.replace('suma de ', '')

    # melt step
    try:
        df = df[['mun_id', 'career', 'type', 'campus_id', 'institution_id',
                 'program', 'e-h', 'e-m', 'g-h', 'g-m']].copy()
    except KeyError:
        df = df[['mun_id', 'career', 'type', 'campus_id', 'institution_id',
                 'program', 'e-h', 'e-m', 't-h', 't-m']].copy()
    df = df.melt(id_vars=['mun_id', 'career', 'type', 'campus_id',
                          'institution_id', 'program'],
                 var_name='stat', value_name='value')
    df = df.loc[df.value != 0]
    split = df['stat'].str.split('-', n=1, expand=True)
    df['stat'] = split[0]
    df['sex'] = split[1]

    # encoding
    df.stat.replace(STAT, inplace=True)
    df.sex.replace(SEX, inplace=True)
    df.type.replace(TYPES, inplace=True)

    # 2020 version status fix
    df.loc[(df['type'].isin([12, 13, 14])) & (df['stat'] == 3), 'stat'] = 2

    # stopwords es
    nltk.download('stopwords')

    # career_id replace
    # Raw name -> processed name
    careers = pd.read_csv(raw_to_careers)
    df.loc[df['program'].isin(['#NAME?', '#¿NOMBRE?']), 'program'] = '0'
    df = df.loc[~df['program'].str.upper().str.contains('TOTAL')].copy()
    missing_ids = df.loc[(~df['program'].isin(list(careers['backup']) + ['0']))].shape[0]
    print('Missing ids (careers):', missing_ids)
    if missing_ids > 0:
        print(df.loc[~df['program'].isin(list(careers['backup']))])
        df.loc[~df['program'].isin(list(careers['backup']))].to_csv('review_career.csv', index=False)
    df.program = df.program.replace(dict(zip(careers.backup, careers.name_es)))

    # Processed name -> numerical id
    careers = pd.read_csv(careers_to_id)
    df.program = df.program.replace(dict(zip(careers.backup, careers.career_id)))

    # campus_id replace
    for col in ['institution_id', 'campus_id']:
        df[col] = df[col].astype(str).str.strip()
    df['backup'] = (df['institution_id'] + df['campus_id']).str.strip()

    # Raw name -> processed name
    institution = pd.read_csv(raw_to_institution)
    df = df.loc[~df['backup'].str.upper().str.contains('TOTAL')].copy()
    missing_ids = df.loc[~df['backup'].isin(list(institution['backup']))].shape[0]
    print('Missing ids (institutions):', missing_ids)
    if missing_ids > 0:
        print(df.loc[~df['backup'].isin(list(institution['backup']))])
        df.loc[~df['backup'].isin(list(institution['backup']))].to_csv('review_campus.csv', index=False)
    df.backup = df.backup.replace(dict(zip(institution.backup, institution.backup_raw)))

    # Processed name -> numerical id
    institution = pd.read_csv(institution_to_id)
    df.backup = df.backup.replace(dict(zip(institution.backup_raw, institution.campus_id)))
    df['campus_id'] = df['backup']

    for col in ['mun_id', 'career', 'program', 'type', 'sex', 'value', 'stat', 'campus_id']:
        print('Current col:', col)
        df[col] = df[col].astype('float')
    df = df[['mun_id', 'campus_id', 'program', 'type', 'sex', 'value', 'stat']].copy()
    df['year'] = int(params.get('year_plus'))
    return df
import pandas as pd
from tabulate import tabulate

from bamboo_lib.helpers import grab_connector, query_to_df

db_connector = grab_connector(__file__, "clickhouse-remote")

# Get table names
names_df = query_to_df(db_connector, "SHOW TABLES")
tables = names_df["name"].tolist()

# Print row count
for t in [table for table in tables if "acs" in table]:
    print("\nTABLE: {}".format(t))
    query = "SELECT year, COUNT(*) FROM {} GROUP BY year ORDER BY year;".format(t)
    df = query_to_df(db_connector, query, col_headers=["year", "count"])
    new_df = pd.DataFrame({
        "year": df["year"].tolist(),
        "count": ["{:,}".format(int(c)) for c in df["count"]]
    })
    print(tabulate(new_df, headers="keys", tablefmt="psql", showindex=False))