def transformdWeatherData_3h(): try: conn = cfg_lib.dbConnect() cur = conn.cursor() # To avoid from data duplications, data is cleaned for related meteorological station from target DWH fact table: cur.execute("DELETE FROM DWH_STG.stg_current_weather_3h ") conn.commit() try: # Data is transformed and loaded to target DWH fact table: cur.execute( "INSERT INTO DWH_STG.stg_current_weather_3h(station_id,date_id,max_temp,min_temp,rain_mm,etl_date) " "SELECT station_id,(substr(date_str,1,4)||substr(date_str,6,2)||substr(date_str,9,2)||substr(date_str,12,2))::integer,max_temp::numeric-273," "min_temp::numeric-273,rain_mm::numeric,CURRENT_TIMESTAMP FROM DWH_EXTR.current_weather " ) conn.commit() except: print "Unexpected errorStation:", sys.exc_info()[0] cur.close() conn.close() return 1 except: print "Unexpected error:", sys.exc_info()[0] return 0
def extractWeatherData_3h(city_id, stationId): try: conn = cfg_lib.dbConnect() cur = conn.cursor() # To avoid from data duplications, data is cleaned for related meteorological station: cur.execute( "DELETE FROM DWH_EXTR.current_weather WHERE station_id = %s", (stationId, )) conn.commit() # json url is created: url = "http://api.openweathermap.org/data/2.5/forecast/city?id=" + str( city_id) + "&APPID=b754a060a05af2d1885f0e7f68d96aaa" response = urllib.urlopen(url) data = json.loads(response.read()) for x in data[u'list']: try: temp_min = x['main'][u'temp_min'] except: temp_min = 0 try: temp_max = x['main'][u'temp_max'] except: temp_max = 0 try: rain_mm = x['rain']['3h'] except: rain_mm = 0 try: cur.execute( "INSERT INTO DWH_EXTR.current_weather(station_id,date_str,max_temp,min_temp,rain_mm,etl_date) VALUES (%s, %s, %s, %s, %s, CURRENT_TIMESTAMP)", ( stationId, x[u'dt_txt'], temp_max, temp_min, rain_mm, )) except: print "Unexpected errorStation:", sys.exc_info()[0] conn.commit() cur.close() conn.close() return 1 except: print "Unexpected error:", sys.exc_info()[0] return 0 ########################################################################################
def transformdWeatherData_daily(): try: conn = cfg_lib.dbConnect() cur = conn.cursor() # To avoid from data duplications, data is cleaned for related meteorological station from target DWH fact table: cur.execute("DELETE FROM DWH_STG.stg_current_weather_daily ") conn.commit() try: # Data is transformed and loaded to target DWH fact table: cur.execute( "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) " "SELECT station_id,to_char(current_date-1, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP " "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date-1, 'YYYYMMDD') GROUP BY station_id " ) cur.execute( "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) " "SELECT station_id,to_char(current_date, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP " "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date, 'YYYYMMDD') GROUP BY station_id " ) cur.execute( "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) " "SELECT station_id,to_char(current_date+1, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP " "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+1, 'YYYYMMDD') GROUP BY station_id " ) cur.execute( "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) " "SELECT station_id,to_char(current_date+2, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP " "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+2, 'YYYYMMDD') GROUP BY station_id " ) cur.execute( "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) " "SELECT station_id,to_char(current_date+3, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP " "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+3, 'YYYYMMDD') GROUP BY station_id " ) cur.execute( "INSERT INTO DWH_STG.stg_current_weather_daily(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) " "SELECT station_id,to_char(current_date+4, 'YYYYMMDD')::integer,MAX(max_temp),MIN(min_temp),CASE WHEN MIN(min_temp) < 0 THEN 1 ELSE 0 END,SUM(rain_mm),CURRENT_TIMESTAMP " "FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 AND substr(date_id::text,1,8)= to_char(current_date+4, 'YYYYMMDD') GROUP BY station_id " ) conn.commit() except: print "Unexpected errorStation:", sys.exc_info()[0] cur.close() conn.close() return 1 except: print "Unexpected error:", sys.exc_info()[0] return 0
def loadWeatherData_3h(): try: conn = cfg_lib.dbConnect() cur = conn.cursor() # To avoid from data duplications, data is cleaned: cur.execute( "DELETE FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=3 and (station_id,date_id) IN" "(SELECT station_id,date_id FROM DWH_STG.stg_current_weather_3h)") cur.execute( "INSERT INTO DWH_AFFINITAS.fct_weather(station_id,log_type_id,date_id,max_temp,min_temp,rain_mm,etl_date)" "SELECT station_id,3,date_id,max_temp,min_temp,rain_mm,CURRENT_TIMESTAMP FROM DWH_STG.stg_current_weather_3h" ) conn.commit() cur.close() conn.close() return 1 except: print "Unexpected error:", sys.exc_info()[0] return 0
def transformHistoricData(): try: conn = cfg_lib.dbConnect() cur = conn.cursor() # To avoid from data duplications, data is cleaned for related meteorological station from target DWH fact table: cur.execute("DELETE FROM DWH_STG.stg_historic_weather ") conn.commit() cur.execute( "INSERT INTO DWH_STG.stg_historic_weather(station_id,date_id,max_temp,min_temp,af_days,rain_mm,sun_hours,etl_date)" "SELECT station_id,(year||lpad(month,2,'0'))::integer,(CASE WHEN max_temp ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN max_temp ELSE NULL END)::numeric," "(CASE WHEN min_temp ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN min_temp ELSE NULL END)::numeric,(CASE WHEN af_days ~ E'^\\d+$' THEN af_days ELSE NULL END)::integer," "(CASE WHEN rain_mm ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN rain_mm ELSE NULL END)::numeric,(CASE WHEN sun_hours ~ E'^([0-9]+[.]?[0-9]*|[.][0-9]+)$' THEN sun_hours ELSE NULL END)::numeric," "CURRENT_TIMESTAMP FROM DWH_EXTR.monthly_historic_weather ") conn.commit() cur.close() conn.close() return 1 except: print "Unexpected error:", sys.exc_info()[0] return 0
def transformdWeatherData_monthly(): try: conn = cfg_lib.dbConnect() cur = conn.cursor() # To avoid from data duplications, data is cleaned: cur.execute("DELETE FROM DWH_STG.stg_current_weather_monthly ") conn.commit() cur.execute( "INSERT INTO DWH_STG.stg_current_weather_monthly(station_id,date_id,max_temp,min_temp,af_days,rain_mm,etl_date) SELECT station_id,to_char(current_date, 'YYYYMM')::integer,MAX(max_temp)," "MIN(min_temp),SUM(af_days),SUM(rain_mm),CURRENT_TIMESTAMP FROM DWH_AFFINITAS.fct_weather WHERE log_type_id=2 AND substr(date_id::text,1,6)= to_char(current_date, 'YYYYMM') GROUP BY station_id" ) conn.commit() cur.close() conn.close() return 1 except: print "Unexpected error:", sys.exc_info()[0] return 0 ########################################################################################
def extractHistoricDataFromUrl(url, stationId): try: conn = cfg_lib.dbConnect() cur = conn.cursor() # To avoid from http 403 error, artificial header should be added to read request: hdr = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive' } # Data read from source URL: readRequest = urllib2.Request(url, headers=hdr) rawData = urllib2.urlopen(readRequest) # To avoid from data duplications, data is cleaned for related meteorological station: cur.execute( "DELETE FROM DWH_EXTR.monthly_historic_weather WHERE station_id = %s", (stationId, )) conn.commit() # Raw historical weather data are inserted to extraction table: for line in rawData: x = line.split() if x[0].isdigit() and int(x[0]) > 1700 and int(x[0]) < 2100: try: cur.execute( "INSERT INTO DWH_EXTR.monthly_historic_weather (station_id,year,month,max_temp,min_temp,af_days,rain_mm,sun_hours,etl_date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)", ( stationId, x[0], x[1], x[2], x[3], x[4], x[5], x[6], )) except: print "Error: ", x[0], " ", x[1] print "Unexpected errorStation:", sys.exc_info()[0] # Extraction errors are saved for further manuel corrections: cur.execute( "INSERT INTO DWH_EXTR.monthly_historic_weather_error (station_id,year,month,raw_data,etl_date) VALUES (%s, %s, %s, %s, CURRENT_TIMESTAMP)", ( stationId, x[0], x[1], line, )) conn.commit() cur.close() conn.close() return 1 except: print "Unexpected error:", sys.exc_info()[0] return 0
##################################################### # AFFINITAS DWH TASK by ONUR SARKAN # This script is main script of task package. # All ETL jobs are executed in this file. ##################################################### import extr_lib import transform_lib import load_lib import cfg_lib import sys try: # Location info are collected for ETL from location dimension table. conn = cfg_lib.dbConnect() cur = conn.cursor() cur.execute("""SELECT station_id, data_url, city_id FROM DWH_AFFINITAS.dim_location""") rows = cur.fetchall() print "Data extraction begin:" for row in rows: print "Data is loading for ", row[1]," \n" #Unfortunately, historic files are not available anymore: #extr_lib.extractHistoricDataFromUrl(row[1], row[0]) extr_lib.extractWeatherData_3h(row[2], row[0]) print "Data transform begin:" transform_lib.transformHistoricData() transform_lib.transformdWeatherData_3h() transform_lib.transformdWeatherData_daily() transform_lib.transformdWeatherData_monthly()