Ejemplo n.º 1
0
def transformdWeatherData_3h():
    try:
        conn = cfg_lib.dbConnect()
        cur = conn.cursor()

        # To avoid from data duplications, data is cleaned for related meteorological station from target DWH fact table:
        cur.execute("DELETE FROM DWH_STG.stg_current_weather_3h ")
        conn.commit()

        try:
            # Data is transformed and loaded to target DWH fact table:
            cur.execute(
                "INSERT INTO DWH_STG.stg_current_weather_3h(station_id,date_id,max_temp,min_temp,rain_mm,etl_date) "
                "SELECT station_id,(substr(date_str,1,4)||substr(date_str,6,2)||substr(date_str,9,2)||substr(date_str,12,2))::integer,max_temp::numeric-273,"
                "min_temp::numeric-273,rain_mm::numeric,CURRENT_TIMESTAMP FROM DWH_EXTR.current_weather "
            )
            conn.commit()
        except:
            print "Unexpected errorStation:", sys.exc_info()[0]
        cur.close()
        conn.close()
        return 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return 0
Ejemplo n.º 2
0
def extractWeatherData_3h(city_id, stationId):
    try:
        conn = cfg_lib.dbConnect()
        cur = conn.cursor()

        # To avoid from data duplications, data is cleaned for related meteorological station:
        cur.execute(
            "DELETE FROM DWH_EXTR.current_weather WHERE station_id = %s",
            (stationId, ))
        conn.commit()

        # json url is created:
        url = "http://api.openweathermap.org/data/2.5/forecast/city?id=" + str(
            city_id) + "&APPID=b754a060a05af2d1885f0e7f68d96aaa"
        response = urllib.urlopen(url)
        data = json.loads(response.read())

        for x in data[u'list']:
            try:
                temp_min = x['main'][u'temp_min']
            except:
                temp_min = 0
            try:
                temp_max = x['main'][u'temp_max']
            except:
                temp_max = 0
            try:
                rain_mm = x['rain']['3h']
            except:
                rain_mm = 0
            try:
                cur.execute(
                    "INSERT INTO DWH_EXTR.current_weather(station_id,date_str,max_temp,min_temp,rain_mm,etl_date) VALUES (%s, %s, %s, %s, %s, CURRENT_TIMESTAMP)",
                    (
                        stationId,
                        x[u'dt_txt'],
                        temp_max,
                        temp_min,
                        rain_mm,
                    ))
            except:
                print "Unexpected errorStation:", sys.exc_info()[0]
        conn.commit()
        cur.close()
        conn.close()
        return 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return 0


########################################################################################
Ejemplo n.º 3
0
def transformdWeatherData_daily():
    try:
        conn = cfg_lib.dbConnect()
        cur = conn.cursor()

        # To avoid from data duplications, data is cleaned for related meteorological station from target DWH fact table:
        cur.execute("DELETE FROM DWH_STG.stg_current_weather_daily ")
        conn.commit()

        try:
            # Data is transformed and loaded to target DWH fact table:
            cur.execute(
                "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) "
                "SELECT station_id,to_char(current_date-1, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP "
                "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date-1, 'YYYYMMDD') GROUP BY station_id "
            )
            cur.execute(
                "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) "
                "SELECT station_id,to_char(current_date, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP "
                "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date, 'YYYYMMDD') GROUP BY station_id "
            )
            cur.execute(
                "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) "
                "SELECT station_id,to_char(current_date+1, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP "
                "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+1, 'YYYYMMDD') GROUP BY station_id "
            )
            cur.execute(
                "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) "
                "SELECT station_id,to_char(current_date+2, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP "
                "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+2, 'YYYYMMDD') GROUP BY station_id "
            )
            cur.execute(
                "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) "
                "SELECT station_id,to_char(current_date+3, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP "
                "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+3, 'YYYYMMDD') GROUP BY station_id "
            )
            cur.execute(
                "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) "
                "SELECT station_id,to_char(current_date+4, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP "
                "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+4, 'YYYYMMDD') GROUP BY station_id "
            )
            conn.commit()

        except:
            print "Unexpected errorStation:", sys.exc_info()[0]
        cur.close()
        conn.close()
        return 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return 0
Ejemplo n.º 4
0
def loadWeatherData_3h():
    try:
        conn = cfg_lib.dbConnect()
        cur = conn.cursor()

        # To avoid from data duplications, data is cleaned:
        cur.execute(
            "DELETE FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 and (station_id,date_id) IN"
            "(SELECT station_id,date_id FROM DWH_STG.stg_current_weather_3h)")
        cur.execute(
            "INSERT INTO DWH_AFFINITAS.fct_weather(station_id,log_type_id,date_id,max_temp,min_temp,rain_mm,etl_date)"
            "SELECT station_id,3,date_id,max_temp,min_temp,rain_mm,CURRENT_TIMESTAMP FROM DWH_STG.stg_current_weather_3h"
        )
        conn.commit()

        cur.close()
        conn.close()
        return 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return 0
Ejemplo n.º 5
0
def transformHistoricData():
    try:
        conn = cfg_lib.dbConnect()
        cur = conn.cursor()

        # To avoid from data duplications, data is cleaned for related meteorological station from target DWH fact table:
        cur.execute("DELETE FROM DWH_STG.stg_historic_weather ")
        conn.commit()

        cur.execute(
            "INSERT INTO DWH_STG.stg_historic_weather(station_id,date_id,max_temp,min_temp,af_days,rain_mm,sun_hours,etl_date)"
            "SELECT station_id,(year||lpad(month,2,'0'))::integer,(CASE WHEN max_temp ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN max_temp ELSE NULL END)::numeric,"
            "(CASE WHEN min_temp ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN min_temp ELSE NULL END)::numeric,(CASE WHEN af_days ~ E'^\\d+$' THEN af_days ELSE NULL END)::integer,"
            "(CASE WHEN rain_mm ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN rain_mm ELSE NULL END)::numeric,(CASE WHEN sun_hours ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN sun_hours ELSE NULL END)::numeric,"
            "CURRENT_TIMESTAMP FROM DWH_EXTR.monthly_historic_weather ")
        conn.commit()

        cur.close()
        conn.close()
        return 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return 0
Ejemplo n.º 6
0
def transformdWeatherData_monthly():
    try:
        conn = cfg_lib.dbConnect()
        cur = conn.cursor()

        # To avoid from data duplications, data is cleaned:
        cur.execute("DELETE FROM DWH_STG.stg_current_weather_monthly ")
        conn.commit()

        cur.execute(
            "INSERT INTO DWH_STG.stg_current_weather_monthly(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) SELECT station_id,to_char(current_date, 'YYYYMM')::integer,MAX(max_temp),"
            "MIN(min_temp),SUM(af_days),SUM(rain_mm),CURRENT_TIMESTAMP FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=2 AND substr(date_id::text,1,6)= to_char(current_date, 'YYYYMM') GROUP BY station_id"
        )
        conn.commit()

        cur.close()
        conn.close()
        return 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return 0


########################################################################################
Ejemplo n.º 7
0
def extractHistoricDataFromUrl(url, stationId):
    try:
        conn = cfg_lib.dbConnect()
        cur = conn.cursor()

        # To avoid from http 403 error, artificial header should be added to read request:
        hdr = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'
        }

        # Data read from source URL:
        readRequest = urllib2.Request(url, headers=hdr)
        rawData = urllib2.urlopen(readRequest)

        # To avoid from data duplications, data is cleaned for related meteorological station:
        cur.execute(
            "DELETE FROM DWH_EXTR.monthly_historic_weather WHERE station_id = %s",
            (stationId, ))
        conn.commit()

        # Raw historical weather data are inserted to extraction table:
        for line in rawData:
            x = line.split()

            if x[0].isdigit() and int(x[0]) > 1700 and int(x[0]) < 2100:
                try:
                    cur.execute(
                        "INSERT INTO DWH_EXTR.monthly_historic_weather (station_id,year,month,max_temp,min_temp,af_days,rain_mm,sun_hours,etl_date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)",
                        (
                            stationId,
                            x[0],
                            x[1],
                            x[2],
                            x[3],
                            x[4],
                            x[5],
                            x[6],
                        ))
                except:
                    print "Error: ", x[0], " ", x[1]
                    print "Unexpected errorStation:", sys.exc_info()[0]
                    # Extraction errors are saved for further manuel corrections:
                    cur.execute(
                        "INSERT INTO DWH_EXTR.monthly_historic_weather_error (station_id,year,month,raw_data,etl_date) VALUES (%s, %s, %s, %s, CURRENT_TIMESTAMP)",
                        (
                            stationId,
                            x[0],
                            x[1],
                            line,
                        ))
            conn.commit()
        cur.close()
        conn.close()
        return 1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return 0
Ejemplo n.º 8
0
#####################################################
# AFFINITAS DWH TASK by ONUR SARKAN
# This script is main script of task package.
# All ETL jobs are executed in this file.
#####################################################
import extr_lib
import transform_lib
import load_lib
import cfg_lib
import sys

try:
    # Location info  are collected for ETL from location dimension table.
    conn = cfg_lib.dbConnect()
    cur = conn.cursor()
    cur.execute("""SELECT station_id, data_url, city_id FROM DWH_AFFINITAS.dim_location""")
    rows = cur.fetchall()

    print "Data extraction begin:"
    for row in rows:
        print "Data is loading for ", row[1]," \n"
        #Unfortunately, historic files are not available anymore:
        #extr_lib.extractHistoricDataFromUrl(row[1], row[0])
        extr_lib.extractWeatherData_3h(row[2], row[0])


    print "Data transform begin:"
    transform_lib.transformHistoricData()
    transform_lib.transformdWeatherData_3h()
    transform_lib.transformdWeatherData_daily()
    transform_lib.transformdWeatherData_monthly()