# Sources
author_src = SQLSource(connection=author_conn, query="SELECT * FROM author")
book_src = SQLSource(connection=book_conn, query="SELECT * FROM book")
country_src = CSVSource(f=country_handle, delimiter=",")

# Tables
author_dim = Dimension(
    name="authordim",
    key="aid",
    attributes=["name", "city", "cid"])

book_dim = SlowlyChangingDimension(
    name="bookdim",
    key="bid",
    attributes=["title", "year", "version"],
    lookupatts=["title"],
    versionatt="version")

country_dim = Dimension(
    name="countrydim",
    key="cid",
    attributes=["country"],
    lookupatts=["country"])

fact_table = FactTable(
    name="facttable",
    keyrefs=["aid", "bid"])

snowflake = SnowflakedDimension([(author_dim, country_dim)])

# We map cities to countries and populate the countrydim
cid_map = {}
for row in country_src:
    cid = country_dim.ensure(row)
    cid_map[row["city"]] = cid

# We populate the authordim and the fact table
for row in author_src:
    if row["city"] in ["Hadsten", "Skanderborg", "Kobenhavn"]:
        row["cid"] = cid_map[row["city"]]
    else:
        row["cid"] = None
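    # A sketch of how the load plausibly continues (this snippet is cut off
    # here): with "cid" resolved, ensure() gives each author its surrogate key.
    row["aid"] = author_dim.ensure(row)

# A minimal sketch of the remaining load, assuming a hypothetical "fact_src"
# that yields rows carrying matching "aid" and "bid" values; scdensure()
# versions book rows on "title". The final commit would go through the target
# ConnectionWrapper, which is not shown in this snippet.
for row in book_src:
    row["bid"] = book_dim.scdensure(row)

for row in fact_src:
    fact_table.ensure(row)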
def cls_to_pygram_fact(cls, schema_name):
    """Build a pygrametl FactTable from a class describing a fact table."""
    fct = FactTable(
        name=schema_name + '.' + cls.cls_get_name(),
        keyrefs=cls.cls_get_key_names(),
        measures=cls.cls_get_measure_names())
    return fct
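# A hypothetical usage sketch: "SaleFact" and its cls_get_* classmethods are
# invented for illustration; only cls_to_pygram_fact comes from the code above.
# Constructing the FactTable also assumes a pygrametl ConnectionWrapper has
# already been registered as the default target connection.
class SaleFact:
    @classmethod
    def cls_get_name(cls):
        return 'sales'

    @classmethod
    def cls_get_key_names(cls):
        return ['bookid', 'timeid']

    @classmethod
    def cls_get_measure_names(cls):
        return ['sale']


fact = cls_to_pygram_fact(SaleFact, 'dw')  # FactTable named 'dw.sales'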
company_info = [('The Hobbit', 1), ('The Bobbit', 5)]

# ... and inserting the necessary data.
c.executemany("INSERT INTO FACTTABLE (BOOK, ISSUE) VALUES (?, ?)", company_info)
conn.commit()

# The wrapper registers itself as pygrametl's default target connection
wrapper = ConnectionWrapper(conn)

dim = Dimension('COMPANY', 'ID', ['NAME', 'AGE', 'ADDRESS', 'SALARY'], ['NAME'])
dim_rep = DimRepresentation(dim, conn)

dim2 = Dimension('LASTNAME', 'NAME', ['LAST'])
dim_rep2 = DimRepresentation(dim2, conn)

ft = FactTable('FACTTABLE', ['BOOK'], ['ISSUE'])
ft_rep = FTRepresentation(ft, conn)

dw = DWRepresentation([dim_rep, ft_rep, dim_rep2], conn)

expected_list1 = [
    {'NAME': 'Anders', 'AGE': 43, 'SALARY': 21000.0, 'ADDRESS': 'Denmark', 'ID': 1},
    {'NAME': 'Charles', 'AGE': 50, 'SALARY': 25000.0, 'ADDRESS': 'Texas', 'ID': 2},
    {'NAME': 'Wolf', 'AGE': 28, 'SALARY': 19000.0, 'ADDRESS': 'Sweden', 'ID': 3},
    {'NAME': 'Hannibal', 'AGE': 45, 'SALARY': 65000.0, 'ADDRESS': 'America', 'ID': 4},
    {'NAME': 'Buggy', 'AGE': 67, 'SALARY': 2000.0, 'ADDRESS': 'America', 'ID': 5},
]
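# A minimal verification sketch using plain sqlite3 rather than the
# representation classes' own query API (which is not shown here): it checks
# that the COMPANY table's contents match expected_list1.
cursor = conn.cursor()
cursor.execute("SELECT ID, NAME, AGE, ADDRESS, SALARY FROM COMPANY ORDER BY ID")
columns = [col[0] for col in cursor.description]
actual_list1 = [dict(zip(columns, values)) for values in cursor.fetchall()]
assert actual_list1 == expected_list1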
import os
import sqlite3
import time

import pygrametl
from pygrametl.tables import Dimension, FactTable


def setup(dbname, number):
    """Create and populate a small star schema in SQLite, timing each phase."""
    # Truncate (or create) the database file so each run starts from scratch
    dbpath = os.path.expanduser(dbname)
    open(dbpath, 'w').close()
    output_conn = sqlite3.connect(dbpath)
    output_cur = output_conn.cursor()
    output_cur.execute("CREATE TABLE dim1 "
                       "(key1 INTEGER PRIMARY KEY, attr1 INTEGER, "
                       "attr2 INTEGER)")
    output_cur.execute("CREATE TABLE dim2 "
                       "(key2 INTEGER PRIMARY KEY, attr3 INTEGER, "
                       "attr4 INTEGER)")
    output_cur.execute("CREATE TABLE ft1 "
                       "(measure INTEGER, key1 INTEGER, key2 INTEGER, "
                       "PRIMARY KEY(key1, key2))")
    output_conn.commit()

    output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    dim1 = Dimension(
        name='dim1',
        key='key1',
        attributes=['attr1', 'attr2'])
    dim2 = Dimension(
        name='dim2',
        key='key2',
        attributes=['attr3', 'attr4'])
    ft1 = FactTable(
        name='ft1',
        keyrefs=['key1', 'key2'],
        measures=['measure'])

    facts = []
    dim1_data = []
    dim2_data = []
    total_elapsed = 0.0

    print('Generating ft1 data')
    start = time.monotonic()
    counter = 0
    for i in range(1, number + 1):
        for j in range(1, number + 1):
            counter += 10
            facts.append({'key1': i, 'key2': j, 'measure': counter})
    elapsed = time.monotonic() - start
    print('Generated: {}s'.format(round(elapsed, 3)))
    total_elapsed += elapsed

    print('Generating dim1 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim1_data.append({'attr1': i, 'attr2': number + 1 - i})
    elapsed = time.monotonic() - start
    print('Generated: {}s'.format(round(elapsed, 3)))
    total_elapsed += elapsed

    print('Generating dim2 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim2_data.append({'attr3': i, 'attr4': number + 1 - i})
    elapsed = time.monotonic() - start
    print('Generated: {}s'.format(round(elapsed, 3)))
    total_elapsed += elapsed

    print('Inserting data into ft1')
    start = time.monotonic()
    for row in facts:
        ft1.insert(row)
    elapsed = time.monotonic() - start
    print('Inserted: {}s'.format(round(elapsed, 3)))
    total_elapsed += elapsed

    print('Inserting data into dim1')
    start = time.monotonic()
    for row in dim1_data:
        dim1.insert(row)
    elapsed = time.monotonic() - start
    print('Inserted: {}s'.format(round(elapsed, 3)))
    total_elapsed += elapsed

    print('Inserting data into dim2')
    start = time.monotonic()
    for row in dim2_data:
        dim2.insert(row)
    elapsed = time.monotonic() - start
    print('Inserted: {}s'.format(round(elapsed, 3)))
    total_elapsed += elapsed

    print('DW populated')
    print('Total time: {}s\n'.format(round(total_elapsed, 3)))
    output_conn.commit()
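# A hypothetical driver for setup(); the database name and the size (100 rows
# per dimension, 100 * 100 facts) are arbitrary choices for illustration.
if __name__ == '__main__':
    setup('benchmark.db', 100)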
book_dimension = Dimension(
    name='book',
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='time',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='location',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='facttable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])


def split_timestamp(row):
    """Split a timestamp of the form 'year/month/day' into its three parts."""
    timestamp = row['timestamp']
    timestamp_split = timestamp.split('/')
    row['year'] = timestamp_split[0]
    row['month'] = timestamp_split[1]
    row['day'] = timestamp_split[2]


for row in region_source:
    location_dimension.insert(row)
region_file_handle.close()

for row in sales_source:
    split_timestamp(row)
    row['bookid'] = book_dimension.ensure(row)
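    # A sketch of how this loop plausibly continues (the snippet is cut off
    # here), following the standard pygrametl example: ensure()/lookup() the
    # remaining dimensions, then insert the finished fact row. A commit
    # through the ConnectionWrapper (not shown in this snippet) would follow
    # the loop.
    row['timeid'] = time_dimension.ensure(row)
    row['locationid'] = location_dimension.lookup(row)
    fact_table.insert(row)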
book_dimension = Dimension(
    name='bookDim',  # name reconstructed to match the timeDim/locationDim naming
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='timeDim',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='locationDim',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='factTable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])

# NOTE: Most of the following code is taken directly from pygrametl.org and
# has little to do with making this example portable to SQLite or with this
# particular implementation. It is, however, down here that the interesting
# part happens, i.e. the actual ETL work.


# Python function needed to split the timestamp into its three parts
def split_timestamp(row):
    """Splits a timestamp containing a date into its three parts"""
    # Splitting of the timestamp into parts
    timestamp = row['timestamp']
    timestamp_split = timestamp.split('/')
    row['year'] = timestamp_split[0]
    row['month'] = timestamp_split[1]
    row['day'] = timestamp_split[2]
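# A minimal sketch of the SQLite wiring the NOTE above alludes to; the file
# name 'sales_dw.db' is an assumption. In a runnable script this would appear
# *before* the table definitions, since the ConnectionWrapper must already
# exist to become their default target connection:
#
#     import sqlite3
#     import pygrametl
#
#     conn = sqlite3.connect('sales_dw.db')
#     conn_wrapper = pygrametl.ConnectionWrapper(connection=conn)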
""" A sample pygrametl program """ __author__ = 'Mathias Claus Jensen' import pygrametl from pygrametl.datasources import SQLSource from pygrametl.tables import Dimension, FactTable import sqlite3 input_conn = sqlite3.connect('input.db') output_conn = sqlite3.connect('output.db') input_src = SQLSource(input_conn, query='SELECT * FROM table') output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn) dim1 = Dimension('dim1', 'key1', ['attr1', 'attr2']) dim2 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4']) ft1 = FactTable(name='ft1', keyrefs=[ 'key1', ]) input_conn.close() output_conn.close()
def __create_tables(self):
    # Systems
    self.system_dimension = CachedDimension(
        name='system',
        key='system_id',
        attributes=['system_name'],
        lookupatts=['system_name'])

    # Stations
    self.start_station_dimension = CachedDimension(
        name='start_station',
        key='start_station_id',
        attributes=['system_id', 'start_station_short_name',
                    'start_station_name', 'start_station_latitude',
                    'start_station_longitude', 'start_station_capacity'],
        lookupatts=['system_id', 'start_station_short_name'],
        rowexpander=start_station_missing_data_expander)

    self.end_station_dimension = CachedDimension(
        name='end_station',
        key='end_station_id',
        attributes=['system_id', 'end_station_short_name',
                    'end_station_name', 'end_station_latitude',
                    'end_station_longitude', 'end_station_capacity'],
        lookupatts=['system_id', 'end_station_short_name'],
        rowexpander=end_station_missing_data_expander)

    # Trip dates and times
    self.date_dimension = CachedDimension(
        name='bdate',
        key='date_id',
        attributes=['year', 'month', 'day', 'day_of_week', 'date_string'],
        lookupatts=['date_string'],
        rowexpander=date_row_expander)

    self.time_dimension = CachedDimension(
        name='btime',
        key='time_id',
        attributes=['hour', 'minute', 'time_string', 'time_of_day'],
        lookupatts=['time_string'],
        rowexpander=time_row_expander)

    # Trips
    self.trip_fact_table = FactTable(
        name='trips',
        measures=['duration_s'],
        keyrefs=['system_id', 'start_station_id', 'end_station_id',
                 'date_id', 'time_id', 'customer_birthyear_id',
                 'customer_gender_id', 'customer_type_id', 'bike_id',
                 'trip_category_id'])

    # Weather fact table (the date dimension it references is defined above)
    self.weather_fact_table = FactTable(
        name='weather',
        measures=['precipitation_in', 'snow_in', 'temp_avg_f',
                  'temp_min_f', 'temp_max_f', 'wind_mph'],
        keyrefs=['system_id', 'date_id'])

    self.trip_category = CachedDimension(
        name='trip_category',
        key='trip_category_id',
        attributes=['trip_category'])

    self.bike_dimension = CachedDimension(
        name='bikes',
        key='bike_id',
        attributes=['system_id', 'bike_name'],
        lookupatts=['system_id', 'bike_name'],
        defaultidvalue=-1)

    self.customer_gender_dimension = CachedDimension(
        name='customer_gender',
        key='customer_gender_id',
        attributes=['customer_gender'],
        lookupatts=['customer_gender'])

    self.customer_birthyear_dimension = CachedDimension(
        name='customer_birthyear',
        key='customer_birthyear_id',
        attributes=['customer_birthyear'],
        lookupatts=['customer_birthyear'])

    self.customer_type_dimension = CachedDimension(
        name='customer_type',
        key='customer_type_id',
        attributes=['customer_type'],
        lookupatts=['customer_type'])

    # Station status
    self.station_status_fact_table = FactTable(
        name='station_status',
        keyrefs=['system_id', 'start_station_id', 'date_id', 'time_id'],
        measures=['bikes_available', 'docks_available'])

    # Non-cached versions of the station dimensions, used only when updating
    # Indego stations.
    self.start_station_noncached_dimension = Dimension(
        name='start_station',
        key='start_station_id',
        attributes=['system_id', 'start_station_short_name',
                    'start_station_name', 'start_station_latitude',
                    'start_station_longitude', 'start_station_capacity'],
        lookupatts=['system_id', 'start_station_short_name'],
        rowexpander=start_station_missing_data_expander,
        defaultidvalue=-1)

    self.end_station_noncached_dimension = Dimension(
        name='end_station',
        key='end_station_id',
        attributes=['system_id', 'end_station_short_name',
                    'end_station_name', 'end_station_latitude',
                    'end_station_longitude', 'end_station_capacity'],
        lookupatts=['system_id', 'end_station_short_name'],
        rowexpander=end_station_missing_data_expander,
        defaultidvalue=-1)
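# A hypothetical sketch of how one trip row might flow through the tables
# defined above: each dimension is ensure()d to obtain its surrogate key, and
# the completed row is inserted into the trips fact table. The method name
# "_load_trip" and the row's source fields are assumptions for illustration.
def _load_trip(self, row):
    row['system_id'] = self.system_dimension.ensure(row)
    row['start_station_id'] = self.start_station_dimension.ensure(row)
    row['end_station_id'] = self.end_station_dimension.ensure(row)
    row['date_id'] = self.date_dimension.ensure(row)
    row['time_id'] = self.time_dimension.ensure(row)
    row['customer_gender_id'] = self.customer_gender_dimension.ensure(row)
    row['customer_birthyear_id'] = self.customer_birthyear_dimension.ensure(row)
    row['customer_type_id'] = self.customer_type_dimension.ensure(row)
    row['bike_id'] = self.bike_dimension.ensure(row)
    row['trip_category_id'] = self.trip_category.ensure(row)
    self.trip_fact_table.insert(row)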