Ejemplo n.º 1
0
 def test_etl(self):
     """Run extract, transform and load back to back, checking each stage.

     Expects 3 rows after extraction from ``self.file_full_path``, 2 rows
     after the transform step, and a result equal to ``True`` from the load
     into ``self.db_name`` / ``self.table_name``.
     """
     extracted = storage.extract_data_from_csv(self.file_full_path)
     self.assertEqual(len(extracted.index), 3)

     transformed = task.transform_data_func(extracted)
     self.assertEqual(len(transformed.index), 2)

     load_result = storage.load_data_into_db(
         transformed, self.db_name, self.table_name)
     self.assertEqual(load_result, True)
def load_data(ds, **kwargs):
    """Load the output of the ``transform_data`` task into the database.

    The value is pulled from XCom through ``kwargs['task_instance']`` and
    handed to ``storage.load_data_into_db`` together with ``db_name`` and
    ``table_name``, which are not defined in this function and are resolved
    from the surrounding scope.

    Args:
        ds: Unused here (presumably the execution date string supplied by
            the scheduler -- confirm against the DAG definition).
        **kwargs: Task context; must contain a ``'task_instance'`` entry
            exposing ``xcom_pull``.
    """
    task_instance = kwargs['task_instance']
    transformed = task_instance.xcom_pull(task_ids='transform_data')
    storage.load_data_into_db(transformed, db_name, table_name)
def sales_fact(ds, **kwargs):
    """Build and load the ``olist_db.f_sales`` fact table.

    Steps:
      1. (Re)create the staging tables ``olist_db.temp_city`` and
         ``olist_db.temp_payment`` through ``query_execute``.
      2. Read the joined fact rows with ``pd.read_sql``.
      3. Load the result with ``storage.load_data_into_db`` into
         ``olist_db`` / ``f_sales``.
      4. Drop both staging tables.

    Step 4 runs in a ``finally`` clause so the staging tables are removed
    even when staging, the read or the load raises; the original exception
    still propagates to the caller.

    Args:
        ds: Unused here (presumably the execution date string supplied by
            the scheduler -- confirm against the DAG definition).
        **kwargs: Unused task context.
    """
    # Staging tables are ordinary tables (plain CREATE TABLE, not
    # CREATE TEMPORARY TABLE), so they outlive the session until dropped.
    temp_city_sql = '''
    DROP TABLE IF EXISTS olist_db.temp_city;
    CREATE TABLE olist_db.temp_city
    SELECT 
    location.city_id,
    location.state_id,
    location.city,
    location.state,
    customers_dataset.customer_id
    FROM
    (SELECT city_id, location_state.state_id, city, state FROM olist_db.d_city AS location_city
    INNER JOIN olist_db.d_state AS location_state ON location_city.state_id = location_state.state_id) AS location,
    (SELECT customers_dataset.customer_id, customer_city AS city, customer_state AS state FROM olist_db.olist_customers_dataset AS customers_dataset
    INNER JOIN olist_db.olist_orders_dataset AS orders_datase ON orders_datase.customer_id = customers_dataset.customer_id) AS customers_dataset
    WHERE customers_dataset.city = location.city
    AND customers_dataset.state = location.state;
    '''

    # The @rownr user variable generates a sequential payment_id per row.
    temp_payment_sql = '''
    DROP TABLE IF EXISTS olist_db.temp_payment;
    SET @rownr=0;
    CREATE TABLE olist_db.temp_payment
    SELECT @rownr:=@rownr+1 AS payment_id, type_id, order_id, payment_sequential, payment_installments, payment_value FROM olist_db.olist_order_payments_dataset AS payments_dataset
    INNER JOIN olist_db.d_payment_type AS payment_type ON payment_type.payment_type = payments_dataset.payment_type;
    '''

    # Date-part dimension keys are resolved from order_approved_at; orders
    # without an approval timestamp are excluded by the WHERE clause.
    fact_sql = """
            SELECT 
            orders_dataset.order_id
            , product_id
            , city_id
            , payment_id
            , review_id
            , (SELECT hour_id FROM olist_db.d_hour WHERE hour = HOUR(order_approved_at)) AS hour_id
            , (SELECT day_id FROM olist_db.d_day WHERE day = DAY(order_approved_at)) AS day_id
            , (SELECT month_id FROM olist_db.d_month WHERE month = MONTH(order_approved_at)) AS month_id
            , (SELECT year_id FROM olist_db.d_year WHERE year = YEAR(order_approved_at)) AS year_id
            , order_items_dataset.price
            FROM 
            olist_db.olist_orders_dataset AS orders_dataset
            INNER JOIN olist_db.olist_order_items_dataset AS order_items_dataset ON order_items_dataset.order_id = orders_dataset.order_id
            INNER JOIN olist_db.temp_payment AS temp_payment ON temp_payment.order_id = orders_dataset.order_id
            INNER JOIN olist_db.olist_customers_dataset AS customers_dataset ON customers_dataset.customer_id = orders_dataset.customer_id
            INNER JOIN olist_db.temp_city AS temp_city ON temp_city.customer_id = customers_dataset.customer_id
            INNER JOIN olist_db.olist_order_reviews_dataset AS olist_order_reviews_dataset ON olist_order_reviews_dataset.order_id = orders_dataset.order_id
            WHERE order_approved_at IS NOT NULL;
        """

    drop_temp_sql = '''
    DROP TABLE IF EXISTS olist_db.temp_city;
    DROP TABLE IF EXISTS olist_db.temp_payment;
    '''

    try:
        # CREATE STAGING TABLES
        print('temp_city')
        query_execute(temp_city_sql,'temp_city')

        print('temp_payment')
        query_execute(temp_payment_sql,'temp_payment')

        # LOAD FACT SALES
        with storage.engine_connect().begin() as connection:
            f_sales = pd.read_sql(fact_sql, connection)
        storage.load_data_into_db(f_sales, 'olist_db', 'f_sales')
    finally:
        # DROP STAGING TABLES -- always, so a failed run does not leave
        # them behind. DROP ... IF EXISTS is safe even if creation failed.
        query_execute(drop_temp_sql,'f_sales')