def integration_procces(**kwargs):
    """Merge the confirmed, recovered and deaths tables into covid.cases_data.

    Reads the three source tables from the `covid` schema, normalizes them,
    joins on (country_region, province_state, event_date), derives mortality
    and recovery rates, then fully replaces the contents of covid.cases_data.
    """
    db_connection = MySqlHook('airflow_db').get_sqlalchemy_engine()

    with db_connection.begin() as transaction:
        df_C = pd.read_sql_table("confirmed", con=transaction, schema='covid')
        df_R = pd.read_sql_table("recovered", con=transaction, schema='covid')
        df_D = pd.read_sql_table("deaths", con=transaction, schema='covid')

    # Drop surrogate keys / coordinates, blank out missing provinces so the
    # merge keys compare equal, and map each table onto its canonical columns.
    df_C = df_C.drop(columns=['id'])
    df_R = df_R.drop(columns=['id', 'lat', 'long'])
    df_D = df_D.drop(columns=['id', 'lat', 'long'])
    for frame in (df_C, df_R, df_D):
        frame['province_state'] = frame['province_state'].fillna('')
    df_C = df_C.rename(columns=COLUMNS_C)
    df_R = df_R.rename(columns=COLUMNS_R)
    df_D = df_D.rename(columns=COLUMNS_D)

    merge_keys = ['country_region', 'province_state', 'event_date']
    df = pd.merge(df_C, df_R, on=merge_keys)
    df = pd.merge(df, df_D, on=merge_keys)

    # NOTE(review): c_cases == 0 would yield inf/NaN rates here — confirm the
    # source data guarantees positive confirmed counts for merged rows.
    df['mortality_rate'] = df['d_cases'] / df['c_cases']
    df['recovery_rate'] = df['r_cases'] / df['c_cases']
    df_final = df

    # Full refresh: wipe the target table, then append the merged frame.
    with db_connection.begin() as transaction:
        transaction.execute("DELETE FROM covid.cases_data WHERE 1=1")
        df_final.to_sql("cases_data", con=transaction, schema="covid",
                        if_exists="append", index=False)
def etl_process(**kwargs):
    """Load the global COVID-19 deaths CSV into Covid.Cases as Type 'D' rows."""
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # Bug fix: build the path from `filename` — it was defined but never used,
    # leaving full_path pointing at a non-existent file.
    full_path = f'{file_path}/{filename}'
    logger.info(full_path)

    df = pd.read_csv(full_path)
    # Unpivot the per-date columns into one (RegDate, Count) row each.
    df = pd.melt(df,
                 id_vars=['Lat', 'Long', 'Province/State', 'Country/Region'],
                 var_name="RegDate", value_name="Count")
    df = df[df["Count"] > 0]  # keep only dates with reported deaths
    df = df.rename(columns={'Province/State': 'State',
                            'Country/Region': 'Country'})
    df['RegDate'] = pd.to_datetime(df['RegDate'])
    df['Type'] = 'D'  # 'D' tags death records in the shared Cases table

    with mysql_connection.begin() as connection:
        # Full refresh of the death rows only; other types are untouched.
        connection.execute("DELETE FROM Covid.Cases WHERE Type='D'")
        df.to_sql('Cases', con=connection, schema='Covid',
                  if_exists='append', index=False)

    os.remove(full_path)
    logger.info(f"Rows inserted confirmed {len(df.index)}")
def etl_process(**kwargs):
    """Load the global recovered-cases CSV into airflowcovid.recoverd."""
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_recovered_global.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # Bug fix: build the path from `filename` — it was defined but never used,
    # leaving full_path pointing at a non-existent file.
    full_path = f'{file_path}/{filename}'

    recovered = pd.read_csv(full_path, encoding="ISO-8859-1").rename(
        columns={'Lat': 'lat', 'Long': 'lon'})
    recovered['lat'] = recovered.lat.astype(str)
    recovered['lon'] = recovered.lon.astype(str)

    # Unpivot the per-date columns into (fecha, recovered) rows.
    variables = ["Province/State", "Country/Region", "lat", "lon"]
    new_recovered = pd.melt(frame=recovered, id_vars=variables,
                            var_name="fecha", value_name="recovered")
    new_recovered["recovered"] = new_recovered["recovered"].astype(int)

    with mysql_connection.begin() as connection:
        # Table name 'recoverd' (sic) matches the existing schema — keep it.
        connection.execute("DELETE FROM airflowcovid.recoverd WHERE 1=1")
        new_recovered.rename(columns=COLUMNS).to_sql('recoverd',
                                                     con=connection,
                                                     schema='airflowcovid',
                                                     if_exists='append',
                                                     index=False)
    os.remove(full_path)
    logger.info("Rows inserted into recoverd table in Mysql")
def insert_process(**kwargs):
    """Replace the contents of covid.confirmed with the transformed CSV."""
    task_instance = kwargs['ti']
    # Path to the CSV produced by the upstream transform task.
    csv_path = task_instance.xcom_pull(task_ids='transform_process')
    engine = MySqlHook('airflow_db').get_sqlalchemy_engine()
    data = pd.read_csv(csv_path)
    with engine.begin() as txn:
        # Wipe-and-reload so the table always mirrors the latest file.
        txn.execute("DELETE FROM covid.confirmed WHERE 1=1")
        data.to_sql("confirmed", con=txn, schema="covid",
                    if_exists="append", index=False)
    os.remove(csv_path)
def etl_process(**kwargs):
    """Rebuild test.consolidate_sales from the consolidation QUERY."""
    logger.info(kwargs["execution_date"])
    engine = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # coerce_float=False keeps decimal columns as-is instead of floats.
    result = pd.read_sql(QUERY, con=engine, coerce_float=False)
    with engine.begin() as conn:
        # Full refresh: drop all existing rows, then append the query result.
        conn.execute("DELETE FROM test.consolidate_sales WHERE 1=1")
        result.to_sql("consolidate_sales", con=conn, schema="test",
                      if_exists="append", index=False)
    logger.info(f"Rows inserted {len(result.index)}")
def insert_process(**kwargs):
    """Replace proyecto_pd.covid19_deaths with the transform step's CSV."""
    task_instance = kwargs['ti']
    # Path to the CSV produced by the upstream transform task.
    csv_path = task_instance.xcom_pull(task_ids='transform_process')
    frame = pd.read_csv(csv_path)
    engine = MySqlHook('db_proyecto').get_sqlalchemy_engine()
    with engine.begin() as txn:
        # Wipe-and-reload so the table always mirrors the latest file.
        txn.execute("DELETE FROM proyecto_pd.covid19_deaths WHERE 1=1")
        frame.to_sql("covid19_deaths", con=txn, schema="proyecto_pd",
                     if_exists="append", index=False)
    os.remove(csv_path)
def insert_process(**kwargs):
    """Reload Mapamundi.casos_covid from the transform step's XCom output.

    NOTE(review): the XCom value is used directly as a DataFrame (`df.to_sql`
    below), so transform_process must push an actual DataFrame — sibling
    tasks in this file pull a CSV *path* instead; confirm upstream behavior.
    """
    ti = kwargs['ti']
    # Value pushed by the upstream transform task.
    source_file = ti.xcom_pull(task_ids='transform_process')
    db_connection = MySqlHook('airflow_db').get_sqlalchemy_engine()
    df = source_file
    with db_connection.begin() as transaction:
        # NOTE(review): TRUNCATE has no schema prefix while to_sql targets the
        # Mapamundi schema — this relies on the connection's default schema
        # being Mapamundi; verify the airflow_db connection configuration.
        transaction.execute("TRUNCATE TABLE casos_covid")
        df.to_sql("casos_covid", con=transaction, schema="Mapamundi",
                  if_exists="append", index=False)
def etl_process(**kwargs):
    """Load the global COVID-19 deaths CSV into test.muertes."""
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # Bug fix: build the path from `filename` — it was defined but never used,
    # leaving full_path pointing at a non-existent file.
    full_path = f'{file_path}/{filename}'

    df = pd.read_csv(full_path, encoding="ISO-8859-1").rename(
        columns={'Lat': 'lat', 'Long': 'lon'})
    # Unpivot the per-date columns into one (Fecha, Casos) row per
    # location/date pair.
    data = df.melt(id_vars=['Province/State', 'Country/Region', 'lat', 'lon'],
                   var_name='Fecha', value_name='Casos')

    with mysql_connection.begin() as connection:
        # Full refresh: drop all existing rows, then append the new load.
        connection.execute("DELETE FROM test.muertes WHERE 1=1")
        data.rename(columns=COLUMNS).to_sql('muertes', con=connection,
                                            schema='test',
                                            if_exists='append', index=False)

    os.remove(full_path)
    logger.info(f"Rows inserted {len(data.index)}")
def insert_process(**kwargs):
    """Refresh covid.time_series_covid19_confirmed_global from the CSV."""
    task_instance = kwargs['ti']
    # Pull the file path produced by the transform task.
    source_file = task_instance.xcom_pull(task_ids='transform_process')
    # Database connection.
    engine = MySqlHook('airflow_db').get_sqlalchemy_engine()
    frame = pd.read_csv(source_file)
    with engine.begin() as txn:
        # Clear whatever the table holds on every run before reloading.
        txn.execute(
            "DELETE FROM covid.time_series_covid19_confirmed_global WHERE 1=1")
        frame.to_sql(
            "time_series_covid19_confirmed_global",  # target table
            con=txn,
            schema="covid",
            if_exists="append",
            index=False)
    os.remove(source_file)
def etl_process(**kwargs):
    """Load sales.csv into test.sales, replacing the table contents."""
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'sales.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # Bug fix: build the path from `filename` — it was defined but never used,
    # leaving full_path pointing at a non-existent file.
    full_path = f'{file_path}/{filename}'

    # Only the mapped columns are read; date columns are parsed on load.
    df = (pd.read_csv(full_path,
                      encoding="ISO-8859-1",
                      usecols=COLUMNS.keys(),
                      parse_dates=DATE_COLUMNS)
          .rename(columns=COLUMNS))

    with mysql_connection.begin() as connection:
        # Full refresh: drop all existing rows, then append the new load.
        connection.execute("DELETE FROM test.sales WHERE 1=1")
        df.to_sql('sales', con=connection, schema='test',
                  if_exists='append', index=False)

    os.remove(full_path)
    logger.info(f"Rows inserted {len(df.index)}")
def etl_process(**kwargs):
    """Load the global confirmed-cases CSV into airflowcovid.confirmed.

    Besides the cumulative count per date, a day-over-day difference column
    is computed and loaded alongside it as `diff`.
    """
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_confirmed_global.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # Bug fix: build the path from `filename` — it was defined but never used,
    # leaving full_path pointing at a non-existent file.
    full_path = f'{file_path}/{filename}'

    confirmed = pd.read_csv(full_path, encoding="ISO-8859-1").rename(
        columns={'Lat': 'lat', 'Long': 'lon'})
    confirmed['lat'] = confirmed.lat.astype(str)
    confirmed['lon'] = confirmed.lon.astype(str)

    # Columns 0-3 are identifiers; the rest hold one cumulative count per
    # date. Append a 'diff<date>' column for each date column.
    length_colm = len(confirmed.columns)
    for i in range(4, length_colm):
        new_colname = 'diff' + confirmed.columns[i]
        if i == 4 or i == length_colm - 1:
            # NOTE(review): the first and last dates get diff 0 by
            # construction — confirm this matches the intended semantics.
            confirmed[new_colname] = 0
        else:
            confirmed[new_colname] = (confirmed[confirmed.columns[i + 1]]
                                      - confirmed[confirmed.columns[i]])

    # Snapshot of the original columns before the date columns are dropped;
    # after the drop, `confirmed` keeps only id vars + the diff columns.
    first_df = confirmed.iloc[:, 0:length_colm]
    confirmed.drop(confirmed.columns[list(range(4, length_colm))],
                   axis=1, inplace=True)

    variables = ["Province/State", "Country/Region", "lat", "lon"]
    new_confirmed = pd.melt(frame=first_df, id_vars=variables,
                            var_name="fecha", value_name="confirmed")
    new_confirmed["confirmed"] = new_confirmed["confirmed"].astype(int)
    new_diff = pd.melt(frame=confirmed, id_vars=variables,
                       var_name="fecha", value_name="Aumento")
    new_diff["Aumento"] = new_diff["Aumento"].astype(int)

    # Both melts preserve row order, so the diff series aligns positionally.
    df_final = new_confirmed
    df_final['diff'] = new_diff['Aumento']

    with mysql_connection.begin() as connection:
        # Full refresh: drop all existing rows, then append the new load.
        connection.execute("DELETE FROM airflowcovid.confirmed WHERE 1=1")
        df_final.rename(columns=COLUMNS).to_sql('confirmed', con=connection,
                                                schema='airflowcovid',
                                                if_exists='append',
                                                index=False)
    os.remove(full_path)
    logger.info("Rows inserted into confirmed table in Mysql")