def func1():
    """Load student and score CSV data into the Student and StudentScore
    dimension tables of the ``test`` MySQL database.

    Rows from the two CSV sources are merge-joined on their shared ``no``
    column, birthdays are parsed into ``date`` objects, and each row is
    ensured in both dimensions before the transaction is committed.

    NOTE(review): credentials/host are hard-coded; presumably this is a
    demo script — confirm before reuse.
    """
    pgconn = MySQLdb.Connect(host="192.168.1.42", port=3307, user="******",
                             passwd="0.618", db="test", charset="utf8")
    connection = pygrametl.ConnectionWrapper(pgconn)
    connection.setasdefault()

    # open() replaces the Python 2-only file() builtin (removed in Python 3);
    # the third argument is the buffer size, exactly as before.
    student = CSVSource(open("./resource/student.txt", 'r', 100000),
                        delimiter=',')
    studentdim = CachedDimension(name="Student",
                                 key="id",
                                 attributes=("studentid", 'name', "birthday"),
                                 lookupatts=("studentid",))
    score = CSVSource(open("./resource/studentscore.txt", 'r', 100000),
                      delimiter=',')
    scoredim = CachedDimension(name="StudentScore",
                               key="id",
                               attributes=("studentid", 'coursename', "score"),
                               lookupatts=("studentid",))

    # Merge-join the two sorted sources on the 'no' column.
    mjdata = MergeJoiningSource(student, 'no', score, 'no')
    for row in mjdata:
        # Convert the birthday string to a date object.
        row['birthday'] = datetime.strptime(row['birthday'], '%Y-%m-%d').date()
        # The 'no' CSV column feeds the 'studentid' attribute in both
        # dimensions; 'course' feeds 'coursename' in the score dimension.
        studentdim.ensure(row, {'studentid': 'no'})
        scoredim.ensure(row, {'studentid': 'no', 'coursename': 'course'})
    connection.commit()
def func1():
    """Load stock price-adjustment history from a local CSV file into the
    SFM_StockAdjPrices dimension of the Data_Fundamental_Master_Genius
    MySQL database.

    NOTE(review): credentials/host and the input path are hard-coded;
    presumably this is a one-off load script — confirm before reuse.
    """
    pgconn = MySQLdb.Connect(host="192.168.1.42", port=3307, user="******",
                             passwd="0.618",
                             db="Data_Fundamental_Master_Genius",
                             charset="utf8")
    connection = pygrametl.ConnectionWrapper(pgconn)
    connection.setasdefault()

    # open() replaces the Python 2-only file() builtin, and the r"..." prefix
    # replaces ur"...", which is a syntax error in Python 3.
    rehabdata = CSVSource(open(r"d:\temp\ReHis000001.csv", 'r', 10000),
                          delimiter=',')
    rehabdim = CachedDimension(
        name="SFM_StockAdjPrices",
        key="RecordID",
        attributes=("MarketCode", 'StockListID', "TradeDate", "PreClosePrice",
                    "OpenPrice", "ClosePrice", "HighPrice", "LowPrice",
                    "PriceAdjType"),
        lookupatts=("MarketCode", 'StockListID', "TradeDate", "PriceAdjType"))

    # Map dimension attribute names to the CSV column names that supply them.
    title_mapping = {"StockListID": "stockcode",
                     "TradeDate": "tradedate",
                     "PreClosePrice": "preClosePrice",
                     "OpenPrice": "OpenClosePrice",
                     "ClosePrice": "ClosePrice"}
    for row in rehabdata:
        # Constant source identifiers that are not present in the CSV file.
        row['MarketCode'] = 102
        row['PriceAdjType'] = 1
        rehabdim.ensure(row, title_mapping)
    connection.commit()
def cls_to_pygram_dim(cls, schema_name, lookup_fields=None):
    """Build a pygrametl dimension object for the table described by *cls*.

    Args:
        cls: model class exposing ``cls_get_name()``,
            ``cls_get_column_names_no_id()`` and ``cls_get_lookup_fields()``.
        schema_name: name of the database schema the dimension table lives in.
        lookup_fields: optional explicit lookup attributes; when omitted (or
            empty) the class's own lookup fields are used.

    Returns:
        A ``CachedDimension`` (with full-row caching) when lookup attributes
        are available, otherwise a plain ``Dimension``.
    """
    # The original default was a mutable list ([]), which is shared between
    # calls; None is the safe equivalent.
    if not lookup_fields:
        lookup_fields = cls.cls_get_lookup_fields()
    if lookup_fields:
        return CachedDimension(name=schema_name + '.' + cls.cls_get_name(),
                               key='id',
                               attributes=cls.cls_get_column_names_no_id(),
                               lookupatts=lookup_fields,
                               cachefullrows=True)
    return Dimension(name=schema_name + '.' + cls.cls_get_name(),
                     key='id',
                     attributes=cls.cls_get_column_names_no_id())
# Convert the date from a string to a python `Date` object. date = datetime.strptime(date, '%Y-%m-%d').date() row['location_year'] = date.year # The year for which to retrieve the GDP is hard-coded to simplify the ETL # process, and because the data only covers 2012. row['gdp'] = pygrametl.getvalue(row, '2012', namemapping) return row # Data dimensions locationdim = CachedDimension( name='Location', key='location_skey', attributes=['location_type', 'location_key', 'city', 'country', 'gdp', 'population', 'life_expectancy', 'anav_income', 'location_year'], lookupatts=['location_key'], rowexpander=locationhandling) productdim = CachedDimension( name='Product', key='product_skey', attributes=['product_key', 'product_name', 'category', 'energy', 'carbohydrates', 'fat', 'protein', 'product_year'], lookupatts=['product_key'], rowexpander=producthandling) datedim = CachedDimension( name='Date', key='date_key',
} cnx = sql.connect(**login) cur = cnx.cursor() connection = etl.ConnectionWrapper(cnx) connection.setasdefault() # define dimension object for ETL # Note that: # - pygrametl object table names are DIM_xxx, FCT_yyy # - MS SQL schema.table names are DIM.xxx, FCT.yyy DIM_AFSLUITREDEN = CachedDimension( name='DIM.AFSLUITREDEN', key='afs_id', attributes=['afs_afsluitreden_code'], size=0, prefill=True ) DIM_BEHANDELING = CachedDimension( name='DIM.BEHANDELING', key='beh_id', attributes=['beh_dbc_specialisme_code', 'beh_dbc_behandeling_code'], size=0, prefill=True ) DIM_DAG = CachedDimension( name='DIM.DAG', key='dag_id',
# null=str(nullval), columns=atts) def datehandling(row): readdatetime = datetime.datetime.strptime(row['EndTime'], '%Y-%m-%d %H:%M:%S') row['readdate'] = readdatetime.strftime('%Y-%m-%d') row['readtime'] = readdatetime.hour def accounttypehandling(row): row['type'] = row['subtype'].split('-')[0] usage_type_dim = CachedDimension(name='smas_water_Type', key='typeid', lookupatts=['subtype'], attributes=['type', 'subtype']) customer_dim = CachedDimension( name='smas_water_customer', key='custid', lookupatts=['accountno'], attributes=['accountno', 'street', 'city', 'province', 'postcode']) meter_dim = CachedDimension(name='smas_water_meter', key='meterid', lookupatts=['meterno'], attributes=['meterno', 'latitude', 'longitude']) hourlyreading_fact = BulkFactTable( name='smas_water_hourlyreading',
"activate_date", "deactivate_date", "version", "valid_from", "valid_to" ], lookupatts=["name"], versionatt="version", fromatt="valid_from", toatt="valid_to", srcdateatt="lastmoddate", cachesize=-1) time_dimension = CachedDimension( name='dim.time', key='time_id', attributes=[ 't_year', 't_month', 't_day', 't_hour', 'day_of_week', 'is_fall_semester', 'is_holiday', 't_timestamp' ], lookupatts=["t_year", "t_month", "t_day", "t_hour"] #rowexpander=time_rowexpander ) store_dimension = SlowlyChangingDimension( name="dim.store", key="store_id", attributes=["name", "description", "version", "valid_from", "valid_to"], lookupatts=["name"], versionatt="version", fromatt="valid_from", toatt="valid_to", srcdateatt="lastmoddate", cachesize=-1)
def run(self): with self.output().open() as f: """ # The actual database connection is handled using a PEP 249 connection pgconn = psycopg2.connect("""dbname='DwColegio' user='******'""") # This ConnectionWrapper will be set as default and is then implicitly used. # A reference to the wrapper is saved to allow for easy access of it later conn = pygrametl.ConnectionWrapper(connection=pgconn) dim_fecha = CachedDimension(name='DimFecha', key='id', attributes=['semestre', 'ano'], lookupatts=['semestre', 'ano'], prefill=True) dim_colegio = CachedDimension(name='DimColegio', key='id', attributes=['nombre', 'financiamiento'], lookupatts=['semestre', 'ano'], prefill=True) #create fecha table def duprange(f, l): for i in range(f, l): yield i yield i
'port': config.get('local_mssql', 'port'), 'database': config.get('wob_ggz', 'database') } cnx = sql.connect(**login) cur = cnx.cursor() connection = etl.ConnectionWrapper(cnx) connection.setasdefault() # define dimension object for ETL # Note that: # - pygrametl object table names are DIM_xxx, FCT_yyy # - MS SQL schema.table names are DIM.xxx, FCT.yyy DIM_AFSLUITREDEN = CachedDimension(name='DIM.AFSLUITREDEN', key='afs_id', attributes=['afs_afsluitreden_code'], size=0, prefill=True) DIM_CIRCUIT = CachedDimension(name='DIM.CIRCUIT', key='cct_id', attributes=['cct_circuit_code'], size=0, prefill=True) DIM_DAG = CachedDimension(name='DIM.DAG', key='dag_id', attributes=['dag_datum'], size=0, prefill=True)
# Take the 'www.domain.org' part from 'http://www.domain.org/page.html' # We also the host name ('www') in the domain in this example. domaininfo = row['url'].split('/')[-2] row['domain'] = domaininfo # Take the top level which is the last part of the domain row['topleveldomain'] = domaininfo.split('.')[-1] def extractserverinfo(row): # Find the server name from a string like "ServerName/Version" row['server'] = row['serverversion'].split('/')[0] # Dimension and fact table objects topleveldim = CachedDimension(name='topleveldomain', key='topleveldomainid', attributes=['topleveldomain']) domaindim = CachedDimension(name='domain', key='domainid', attributes=['domain', 'topleveldomainid'], lookupatts=['domain']) serverdim = CachedDimension(name='server', key='serverid', attributes=['server']) serverversiondim = CachedDimension(name='serverversion', key='serverversionid', attributes=['serverversion', 'serverid'])
# Connection to target DW: import MySQLdb myconn = MySQLdb.connect(user='******', passwd='hola',db='Estadisticas') connection=pygrametl.ConnectionWrapper(myconn) connection.setasdefault() def loader(name,atts,fieldsep,rowsep,nullval,filehandle): curs=MySQLConnection.cursor() curs.copy_from(file=filehandle,table=name,sep=fieldsep, null=str(nullval),columns=atts) #base de datos sgbstdn = CachedDimension( name='SGBSTDN', key = 'matricula', attributes = ['nombre','paterno', 'materno', 'degc_code','class_code'], lookupatts = ['matricula'] ) scbcrse = CachedDimension( name='SCBCRSE', key = 'cvemat', attributes = ['nommat','clase','lab','unidades'], lookupatts = ['cvemat'] ) ssbsect_algo = CachedDimension( name = 'SSBSECT', key = 'crn', attributes = ['cvemat', 'grupo', 'levl_code', 'coll_code', 'dept_code']
def load_dimensions(output_conn):
    """Create the pygrametl dimension objects used by the ETL process.

    Args:
        output_conn: an open PEP 249 connection to the target data warehouse.

    Returns:
        dict mapping each dimension table name to its pygrametl object.
    """
    wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    # Date/time dimension, fully prefetched and looked up by epoch.
    datetime_dim = CachedDimension(
        name='dim_datetime',
        key='datetime_id',
        attributes=['epoch', 'minute', 'minute_20', 'minute_30', 'hour',
                    'day_of_week', 'day_of_month', 'week', 'month', 'year',
                    'period'],
        lookupatts=['epoch'],
        size=0,
        prefill=True,
        targetconnection=wrapper)

    # The remaining dimensions are type-1 SCDs: updates overwrite in place.
    location_dim = TypeOneSlowlyChangingDimension(
        name='dim_location',
        key='location_id',
        attributes=['lookup_location', 'initial_id', 'company_code', 'street',
                    'ward', 'district', 'city', 'area', 'country',
                    'level1flag', 'level2flag', 'level3flag', 'level4flag',
                    'level5flag', 'level6flag'],
        lookupatts=['lookup_location'],
        cachesize=0,
        prefill=True,
        targetconnection=wrapper)

    employee_dim = TypeOneSlowlyChangingDimension(
        name='dim_employee',
        key='employee_id',
        attributes=['lookup_employee', 'initial_id', 'company_code', 'login',
                    'name', 'active', 'mobile', 'email'],
        lookupatts=['lookup_employee'],
        cachesize=0,
        prefill=True,
        targetconnection=wrapper)

    partner_dim = TypeOneSlowlyChangingDimension(
        name='dim_partner',
        key='partner_id',
        attributes=['lookup_partner', 'initial_id', 'company_code', 'name',
                    'ref', 'is_company', 'active', 'customer', 'supplier',
                    'employee', 'state', 'seq', 'seq_order', 'street_id',
                    'classify', 'total_sh'],
        lookupatts=['lookup_partner'],
        cachesize=0,
        prefill=True,
        targetconnection=wrapper)

    company_dim = TypeOneSlowlyChangingDimension(
        name='dim_company',
        key='company_id',
        attributes=['company_code', 'company_name'],
        lookupatts=['company_code'],
        cachesize=0,
        prefill=True,
        targetconnection=wrapper)

    return {'dim_datetime': datetime_dim,
            'dim_location': location_dim,
            'dim_employee': employee_dim,
            'dim_partner': partner_dim,
            'dim_company': company_dim}
def __create_tables(self):
    """Instantiate every dimension and fact-table object used by the ETL
    and attach them to ``self``."""

    def station_atts(prefix):
        # The start/end station tables share one layout that differs only
        # by column-name prefix; returns (attributes, lookupatts).
        cols = ['system_id',
                prefix + '_station_short_name',
                prefix + '_station_name',
                prefix + '_station_latitude',
                prefix + '_station_longitude',
                prefix + '_station_capacity']
        return cols, ['system_id', prefix + '_station_short_name']

    # Systems
    self.system_dimension = CachedDimension(name='system',
                                            key='system_id',
                                            attributes=['system_name'],
                                            lookupatts=['system_name'])

    # Stations
    start_atts, start_lookup = station_atts('start')
    self.start_station_dimension = CachedDimension(
        name='start_station',
        key='start_station_id',
        attributes=start_atts,
        lookupatts=start_lookup,
        rowexpander=start_station_missing_data_expander)

    end_atts, end_lookup = station_atts('end')
    self.end_station_dimension = CachedDimension(
        name='end_station',
        key='end_station_id',
        attributes=end_atts,
        lookupatts=end_lookup,
        rowexpander=end_station_missing_data_expander)

    # Trip dates and times
    self.date_dimension = CachedDimension(
        name='bdate',
        key='date_id',
        attributes=['year', 'month', 'day', 'day_of_week', 'date_string'],
        lookupatts=['date_string'],
        rowexpander=date_row_expander)

    self.time_dimension = CachedDimension(
        name='btime',
        key='time_id',
        attributes=['hour', 'minute', 'time_string', 'time_of_day'],
        lookupatts=['time_string'],
        rowexpander=time_row_expander)

    # Trips
    self.trip_fact_table = FactTable(
        name='trips',
        measures=['duration_s'],
        keyrefs=['system_id', 'start_station_id', 'end_station_id',
                 'date_id', 'time_id', 'customer_birthyear_id',
                 'customer_gender_id', 'customer_type_id', 'bike_id',
                 'trip_category_id'])

    # Weather fact table (per system, per date)
    self.weather_fact_table = FactTable(
        name='weather',
        measures=['precipitation_in', 'snow_in', 'temp_avg_f', 'temp_min_f',
                  'temp_max_f', 'wind_mph'],
        keyrefs=['system_id', 'date_id'])

    self.trip_category = CachedDimension(name='trip_category',
                                         key='trip_category_id',
                                         attributes=['trip_category'])

    self.bike_dimension = CachedDimension(
        name='bikes',
        key='bike_id',
        attributes=['system_id', 'bike_name'],
        lookupatts=['system_id', 'bike_name'],
        defaultidvalue=-1)

    # Customer descriptors
    self.customer_gender_dimension = CachedDimension(
        name='customer_gender',
        key='customer_gender_id',
        attributes=['customer_gender'],
        lookupatts=['customer_gender'])

    self.customer_birthyear_dimension = CachedDimension(
        name='customer_birthyear',
        key='customer_birthyear_id',
        attributes=['customer_birthyear'],
        lookupatts=['customer_birthyear'])

    self.customer_type_dimension = CachedDimension(
        name='customer_type',
        key='customer_type_id',
        attributes=['customer_type'],
        lookupatts=['customer_type'])

    # Station status
    self.station_status_fact_table = FactTable(
        name='station_status',
        keyrefs=['system_id', 'start_station_id', 'date_id', 'time_id'],
        measures=['bikes_available', 'docks_available'])

    # Non-cached station dimensions, used only when updating Indego stations.
    self.start_station_noncached_dimension = Dimension(
        name='start_station',
        key='start_station_id',
        attributes=start_atts,
        lookupatts=start_lookup,
        rowexpander=start_station_missing_data_expander,
        defaultidvalue=-1)

    self.end_station_noncached_dimension = Dimension(
        name='end_station',
        key='end_station_id',
        attributes=end_atts,
        lookupatts=end_lookup,
        rowexpander=end_station_missing_data_expander,
        defaultidvalue=-1)