コード例 #1
0
def integration_procces(**kwargs):
    """Join the confirmed/recovered/deaths staging tables into covid.cases_data.

    Reads the three source tables, normalizes them onto a common merge key,
    joins them, derives mortality/recovery rates and fully replaces the
    contents of covid.cases_data.
    """
    db_connection = MySqlHook('airflow_db').get_sqlalchemy_engine()
    with db_connection.begin() as transaction:
        df_C = pd.read_sql_table("confirmed", con=transaction, schema='covid')
        df_R = pd.read_sql_table("recovered", con=transaction, schema='covid')
        df_D = pd.read_sql_table("deaths", con=transaction, schema='covid')
    # Drop surrogate keys everywhere; keep lat/long only once (from confirmed)
    # so the merges don't duplicate coordinate columns.
    df_C = df_C.drop(columns=['id'])
    df_R = df_R.drop(columns=['id', 'lat', 'long'])
    df_D = df_D.drop(columns=['id', 'lat', 'long'])
    # province_state is part of the merge key, so NULLs must be unified.
    for frame in (df_C, df_R, df_D):
        frame['province_state'] = frame['province_state'].fillna('')
    df_C = df_C.rename(columns=COLUMNS_C)
    df_R = df_R.rename(columns=COLUMNS_R)
    df_D = df_D.rename(columns=COLUMNS_D)
    merge_keys = ['country_region', 'province_state', 'event_date']
    df = pd.merge(df_C, df_R, on=merge_keys)
    df = pd.merge(df, df_D, on=merge_keys)
    # NOTE(review): rows with c_cases == 0 produce inf/NaN here — confirm the
    # source data guarantees c_cases > 0, or filter before dividing.
    df['mortality_rate'] = df['d_cases'] / df['c_cases']
    df['recovery_rate'] = df['r_cases'] / df['c_cases']
    df_final = df
    with db_connection.begin() as transaction:
        # Full refresh: wipe the target table, then bulk-append the new rows.
        transaction.execute("DELETE FROM covid.cases_data WHERE 1=1")
        df_final.to_sql("cases_data",
                        con=transaction,
                        schema="covid",
                        if_exists="append",
                        index=False)
コード例 #2
0
def etl_process(**kwargs):
    """Load the global COVID-19 deaths time-series CSV into Covid.Cases.

    Unpivots the wide per-date columns into (RegDate, Count) rows, keeps only
    positive counts, tags them Type='D' and replaces the 'D' slice of the
    shared Covid.Cases table. The source file is deleted afterwards.
    """
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # BUG FIX: the path must include the actual file name ({filename} was lost).
    full_path = f'{file_path}/{filename}'
    logger.info(full_path)
    df = pd.read_csv(full_path)
    # Wide -> long: one row per (location, date).
    df = pd.melt(df,
                 id_vars=['Lat', 'Long', 'Province/State', 'Country/Region'],
                 var_name="RegDate",
                 value_name="Count")
    df = df[df["Count"] > 0]
    df = df.rename(columns={
        'Province/State': 'State',
        'Country/Region': 'Country'
    })
    df['RegDate'] = pd.to_datetime(df['RegDate'])
    df['Type'] = 'D'  # 'D' marks deaths rows in the shared Cases table.
    with mysql_connection.begin() as connection:
        # Replace only the deaths slice; other Type values are untouched.
        connection.execute("DELETE FROM Covid.Cases WHERE Type='D'")
        df.to_sql('Cases',
                  con=connection,
                  schema='Covid',
                  if_exists='append',
                  index=False)

    os.remove(full_path)

    # BUG FIX: message previously said "confirmed" while loading deaths.
    logger.info(f"Rows inserted {len(df.index)}")
コード例 #3
0
def etl_process(**kwargs):
    """Load the global COVID-19 recovered time series into airflowcovid.recoverd.

    Reads the wide CSV, unpivots the per-date columns into rows and fully
    replaces the target table. The source file is deleted afterwards.
    """
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_recovered_global.csv'
    mysql_connection = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # BUG FIX: the path must include the actual file name ({filename} was lost).
    full_path = f'{file_path}/{filename}'

    recovered = pd.read_csv(full_path, encoding="ISO-8859-1").rename(
        columns={'Lat': 'lat', 'Long': 'lon'})
    # Coordinates are stored as text in the target table.
    recovered['lat'] = recovered.lat.astype(str)
    recovered['lon'] = recovered.lon.astype(str)

    variables = [
        "Province/State",
        "Country/Region",
        "lat",
        "lon",
    ]

    # Wide -> long: one row per (location, fecha).
    new_recovered = pd.melt(frame=recovered, id_vars=variables,
                            var_name="fecha", value_name="recovered")
    new_recovered["recovered"] = new_recovered["recovered"].astype(int)

    with mysql_connection.begin() as connection:
        # Full refresh. Table name 'recoverd' matches the existing schema,
        # typo and all — do not "fix" it here without a migration.
        connection.execute("DELETE FROM airflowcovid.recoverd WHERE 1=1")
        new_recovered.rename(columns=COLUMNS).to_sql(
            'recoverd', con=connection, schema='airflowcovid',
            if_exists='append', index=False)

    os.remove(full_path)

    logger.info("Rows inserted into recoverd table in Mysql")
コード例 #4
0
def insert_process(**kwargs):
    """Replace the covid.confirmed table with the CSV produced upstream."""
    # The transform task publishes the CSV path via XCom.
    source_file = kwargs['ti'].xcom_pull(task_ids='transform_process')
    engine = MySqlHook('airflow_db').get_sqlalchemy_engine()
    frame = pd.read_csv(source_file)
    with engine.begin() as txn:
        # Full refresh: clear the table, then bulk-append the new rows.
        txn.execute("DELETE FROM covid.confirmed WHERE 1=1")
        frame.to_sql("confirmed",
                     con=txn,
                     schema="covid",
                     if_exists="append",
                     index=False)
    os.remove(source_file)
コード例 #5
0
def etl_process(**kwargs):
    """Rebuild test.consolidate_sales from the consolidation QUERY."""
    logger.info(kwargs["execution_date"])
    engine = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()

    # coerce_float=False keeps values exactly as the driver returns them.
    result = pd.read_sql(QUERY, con=engine, coerce_float=False)

    with engine.begin() as conn:
        # Full refresh: wipe the target, then append the fresh result set.
        conn.execute("DELETE FROM test.consolidate_sales WHERE 1=1")
        result.to_sql("consolidate_sales", con=conn, schema="test",
                      if_exists="append", index=False)

    logger.info(f"Rows inserted {len(result.index)}")
コード例 #6
0
def insert_process(**kwargs):
    """Replace proyecto_pd.covid19_deaths with the transformed CSV."""
    # Path to the CSV published by the upstream transform task.
    csv_path = kwargs['ti'].xcom_pull(task_ids='transform_process')
    deaths = pd.read_csv(csv_path)
    engine = MySqlHook('db_proyecto').get_sqlalchemy_engine()

    with engine.begin() as txn:
        # Full refresh: clear the table before re-loading it.
        txn.execute("DELETE FROM proyecto_pd.covid19_deaths WHERE 1=1")
        deaths.to_sql("covid19_deaths", con=txn,
                      schema="proyecto_pd", if_exists="append", index=False)

    os.remove(csv_path)
コード例 #7
0
def insert_process(**kwargs):
    """Replace Mapamundi.casos_covid with the output of transform_process.

    Accepts either a CSV path (the usual XCom payload from transform tasks)
    or an already-built DataFrame.
    """
    ti = kwargs['ti']
    source_file = ti.xcom_pull(task_ids='transform_process')

    db_connection = MySqlHook('airflow_db').get_sqlalchemy_engine()
    # BUG FIX: sibling DAGs publish a CSV *path* via XCom; a bare string has
    # no .to_sql, so load it first. A DataFrame payload still works unchanged.
    df = pd.read_csv(source_file) if isinstance(source_file, str) else source_file

    with db_connection.begin() as transaction:
        # Full refresh of the target table.
        # NOTE(review): TRUNCATE is unqualified while the insert targets
        # schema 'Mapamundi' — confirm the connection's default schema.
        transaction.execute("TRUNCATE TABLE casos_covid")
        df.to_sql("casos_covid",
                  con=transaction,
                  schema="Mapamundi",
                  if_exists="append",
                  index=False)
コード例 #8
0
def etl_process(**kwargs):
    """Load the global COVID-19 deaths time series into test.muertes.

    Unpivots the wide per-date columns into (Fecha, Casos) rows and fully
    replaces the target table. The source file is deleted afterwards.
    """
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # BUG FIX: the path must include the actual file name ({filename} was lost).
    full_path = f'{file_path}/{filename}'
    df = pd.read_csv(full_path, encoding="ISO-8859-1").rename(
        columns={'Lat': 'lat', 'Long': 'lon'})
    # Wide -> long: one row per (location, Fecha).
    data = df.melt(id_vars=['Province/State', 'Country/Region', 'lat', 'lon'],
                   var_name='Fecha', value_name='Casos')
    with mysql_connection.begin() as connection:
        # Full refresh: wipe the target, then append the new rows.
        connection.execute("DELETE FROM test.muertes WHERE 1=1")
        data.rename(columns=COLUMNS).to_sql('muertes', con=connection,
                                            schema='test', if_exists='append',
                                            index=False)

    os.remove(full_path)

    logger.info(f"Rows inserted {len(data.index)}")
コード例 #9
0
def insert_process(**kwargs):
    """Refresh covid.time_series_covid19_confirmed_global from the transformed CSV."""
    ti = kwargs['ti']
    # Pull the CSV path published by the upstream transform task.
    source_file = ti.xcom_pull(task_ids='transform_process')
    # Database connection.
    engine = MySqlHook('airflow_db').get_sqlalchemy_engine()

    frame = pd.read_csv(source_file)

    with engine.begin() as txn:
        # Empty the table on every run before re-loading it.
        txn.execute(
            "DELETE FROM covid.time_series_covid19_confirmed_global WHERE 1=1")
        frame.to_sql(
            "time_series_covid19_confirmed_global",  # target table
            con=txn,
            schema="covid",
            if_exists="append",
            index=False)
    os.remove(source_file)
コード例 #10
0
File: sales_dag.py  Project: rub3ng0nzalez/MSDegree
def etl_process(**kwargs):
    """Load sales.csv into test.sales, fully replacing the table.

    Only the columns listed in COLUMNS are read (and renamed through it);
    DATE_COLUMNS are parsed as dates. The source file is deleted afterwards.
    """
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'sales.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # BUG FIX: the path must include the actual file name ({filename} was lost).
    full_path = f'{file_path}/{filename}'
    df = (pd.read_csv(full_path,
                      encoding="ISO-8859-1",
                      usecols=COLUMNS.keys(),
                      parse_dates=DATE_COLUMNS).rename(columns=COLUMNS))

    with mysql_connection.begin() as connection:
        # Full refresh: wipe the target, then append the new rows.
        connection.execute("DELETE FROM test.sales WHERE 1=1")
        df.to_sql('sales',
                  con=connection,
                  schema='test',
                  if_exists='append',
                  index=False)

    os.remove(full_path)

    logger.info(f"Rows inserted {len(df.index)}")
コード例 #11
0
def etl_process(**kwargs):
    """Load the global confirmed-cases time series into airflowcovid.confirmed.

    Besides the cumulative counts, a per-date day-over-day increase column
    ('diff') is computed before the wide table is unpivoted and the target
    table fully replaced. The source file is deleted afterwards.
    """
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_confirmed_global.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    # BUG FIX: the path must include the actual file name ({filename} was lost).
    full_path = f'{file_path}/{filename}'
    confirmed = pd.read_csv(full_path,
                            encoding="ISO-8859-1").rename(columns={
                                'Lat': 'lat',
                                'Long': 'lon'
                            })
    # Coordinates are stored as text in the target table.
    confirmed['lat'] = confirmed.lat.astype(str)
    confirmed['lon'] = confirmed.lon.astype(str)

    # Columns 0-3 are id vars; every later column is one date. Append a
    # 'diff<date>' column per date holding the increase to the NEXT date;
    # the first and last dates get 0 (original behavior — both branches
    # assigned 0, merged here).
    length_colm = len(confirmed.columns)
    for i in range(4, length_colm):
        new_colname = 'diff' + confirmed.columns[i]
        if i == 4 or i == length_colm - 1:
            confirmed[new_colname] = 0
        else:
            # Safe while i + 1 < length_colm: appended diff columns only
            # occupy indices >= length_colm.
            confirmed[new_colname] = confirmed[confirmed.columns[
                i + 1]] - confirmed[confirmed.columns[i]]

    # Split: first_df keeps id vars + cumulative counts (iloc copy taken
    # BEFORE the drop), df keeps id vars + the diff columns.
    first_df = confirmed.iloc[:, 0:length_colm]
    cols = list(range(4, length_colm))
    df = confirmed
    df.drop(df.columns[cols], axis=1, inplace=True)

    variables = ["Province/State", "Country/Region", "lat", "lon"]

    new_confirmed = pd.melt(frame=first_df,
                            id_vars=variables,
                            var_name="fecha",
                            value_name="confirmed")
    new_confirmed["confirmed"] = new_confirmed["confirmed"].astype(int)

    new_diff = pd.melt(frame=df,
                       id_vars=variables,
                       var_name="fecha",
                       value_name="Aumento")
    new_diff["Aumento"] = new_diff["Aumento"].astype(int)

    # Both melts preserve source row order, so the increase column can be
    # attached positionally (their 'fecha' strings differ: diff-prefixed).
    df_final = new_confirmed
    df_final['diff'] = new_diff['Aumento']

    with mysql_connection.begin() as connection:
        # Full refresh of the target table.
        connection.execute("DELETE FROM airflowcovid.confirmed WHERE 1=1")
        df_final.rename(columns=COLUMNS).to_sql('confirmed',
                                                con=connection,
                                                schema='airflowcovid',
                                                if_exists='append',
                                                index=False)

    os.remove(full_path)

    logger.info("Rows inserted into confirmed table in Mysql")