def load_cdr(self):
    call_dimension = Dimension(
        name='"DimCall"',
        key='uuid',
        attributes=['SipFromUser', 'SipToUser', 'HangupCause', 'Direction',
                    'SwitchName', 'CallerContext', 'IsAnswered', 'Duration',
                    'ObjClass', 'ObjType', 'ObjCategory', 'CompanyId',
                    'TenantId', 'AppId', 'AgentSkill', 'OriginatedLegs',
                    'DVPCallDirection', 'HangupDisposition', 'AgentAnswered',
                    'IsQueued', 'SipResource', 'BusinessUnit', 'CampaignId',
                    'CampaignName', 'ExtraData'])

    call_fact_table = FactTable(
        name='"FactCall"',
        keyrefs=['uuid', 'CallUuid', 'BridgeUuid', 'CreatedDateDimId',
                 'AnsweredDateDimId', 'BridgedDateDimId', 'HangupDateDimId',
                 'CreatedTimeDimId', 'AnsweredTimeDimId', 'BridgedTimeDimId',
                 'HangupTimeDimId'],
        measures=['CreatedTime', 'AnsweredTime', 'BridgedTime', 'HangupTime',
                  'BillSec', 'HoldSec', 'ProgressSec', 'QueueSec',
                  'AnswerSec', 'WaitSec', 'ProgressMediaSec', 'FlowBillSec'])

    self.row['uuid'] = call_dimension.ensure(self.row)
    call_fact_table.insert(self.row)
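# The ensure-then-insert pattern above recurs in every loader below:
# Dimension.ensure() looks the row up, inserts it if it is missing, and
# returns the surrogate key, which is written back into the row so that
# FactTable.insert() can resolve its keyrefs. A minimal self-contained
# sketch of the same flow (table and column names are illustrative, not
# taken from the loaders in this file):
import sqlite3
import pygrametl
from pygrametl.tables import Dimension, FactTable

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE dimdemo (id INTEGER PRIMARY KEY, label TEXT)')
conn.execute('CREATE TABLE factdemo (id INTEGER, amount INTEGER)')
wrapper = pygrametl.ConnectionWrapper(conn)  # becomes the default target

dim = Dimension(name='dimdemo', key='id', attributes=['label'],
                lookupatts=['label'])
fact = FactTable(name='factdemo', keyrefs=['id'], measures=['amount'])

row = {'label': 'example', 'amount': 42}
row['id'] = dim.ensure(row)  # lookup-or-insert; returns the surrogate key
fact.insert(row)             # keyrefs now resolvable from the row
wrapper.commit()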
def load_tickets(self):
    ticket_dimension = Dimension(
        name='"DimTicket"',
        key='ticket_id',
        attributes=['active', 'tid', 'is_sub_ticket', 'subject', 'reference',
                    'description', 'priority', 'requester', 'submitter',
                    'collaborators', 'watchers', 'attachments', 'sub_tickets',
                    'related_tickets', 'merged_tickets', 'isolated_tags',
                    'sla', 'comments', 'tags', 'ticket_type', 'status_type',
                    'channel', 'company', 'tenant', 'assignee',
                    'assignee_group', 'security_level', 'event_author'])

    ticket_fact_table = FactTable(
        name='"FactTicket"',
        keyrefs=['engagement_session_id', 'ticket_id', 'created_at_dim_id',
                 'updated_at_dim_id', 'due_at_dim_id', 'created_time_dim_id',
                 'updated_time_dim_id', 'due_time_dim_id'],
        measures=['created_at', 'updated_at', 'due_at', 'time_estimation',
                  're_opens', 'waited_time', 'worked_time',
                  'resolution_time'])

    # Update the row with the dimension's primary key while at the same time
    # inserting any new data into the dimension
    self.row['ticket_id'] = ticket_dimension.ensure(self.row)
    ticket_fact_table.insert(self.row)
def load_engagement(self):
    engagement_dimension = Dimension(
        name='"DimEngagement"',
        key='engagement_id',
        attributes=['profile', 'channel_from', 'channel_to', 'company',
                    'has_profile', 'tenant', 'notes', 'channel'])

    engagement_fact_table = FactTable(
        name='"FactEngagement"',
        keyrefs=['engagement_id', 'created_at_dim_id', 'updated_at_dim_id',
                 'created_time_dim_id', 'updated_time_dim_id'],
        measures=['updated_at', 'created_at'])

    # Write the surrogate key back under 'engagement_id' so the fact
    # table's keyref resolves
    self.row['engagement_id'] = engagement_dimension.ensure(self.row)
    engagement_fact_table.insert(self.row)
    return True
def load_ticket_events(self):
    ticket_event_dimension = Dimension(
        name='"DimTicketEvent"',
        key='ticket_event_id',
        attributes=['event_type', 'event_author', 'event_body'])

    ticket_event_fact_table = FactTable(
        name='"FactTicketEvent"',
        keyrefs=['ticket_id', 'ticket_event_id'],
        measures=['event_created_at_dim_id', 'event_created_at'])

    # Update the row with the dimension's primary key while at the same time
    # inserting any new data into the dimension
    self.row['ticket_event_id'] = ticket_event_dimension.ensure(self.row)
    ticket_event_fact_table.insert(self.row)
def load_external_users(self):
    external_users_dimension = Dimension(
        name='"DimExternalUsers"',
        key='external_user_id',
        attributes=['title', 'name', 'avatar', 'birthday', 'gender',
                    'first_name', 'last_name', 'locale', 'ssn', 'password',
                    'phone', 'email', 'landnumber', 'facebook', 'twitter',
                    'linkedin', 'googleplus', 'skype', 'thirdpartyreference',
                    'company', 'tenant', 'custom_fields', 'tags', 'contacts',
                    'zipcode', 'address_number', 'city', 'province',
                    'country'])

    external_users_fact_table = FactTable(
        name='"FactExternalUsers"',
        keyrefs=['external_user_id', 'created_at_dim_id',
                 'updated_at_dim_id', 'created_time_dim_id',
                 'updated_time_dim_id'],
        measures=['created_at', 'updated_at'])

    self.row['external_user_id'] = external_users_dimension.ensure(self.row)
    external_users_fact_table.insert(self.row)
def cls_to_pygram_dim(cls, schema_name, lookup_fields=None):
    # A mutable default ([]) would be shared across calls; use None instead.
    # cls.cls_init_cols()
    if not lookup_fields:
        lookup_fields = cls.cls_get_lookup_fields()
    if lookup_fields:
        dim = CachedDimension(name=schema_name + '.' + cls.cls_get_name(),
                              key='id',
                              attributes=cls.cls_get_column_names_no_id(),
                              lookupatts=lookup_fields,
                              cachefullrows=True)
    else:
        dim = Dimension(name=schema_name + '.' + cls.cls_get_name(),
                        key='id',
                        attributes=cls.cls_get_column_names_no_id())
    return dim
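# cls_to_pygram_dim assumes the passed class exposes cls_get_name(),
# cls_get_column_names_no_id(), and cls_get_lookup_fields(). A hypothetical
# class satisfying that contract (names are illustrative, not from the
# source) might look like this:
class Customer:
    @classmethod
    def cls_get_name(cls):
        return 'customer'

    @classmethod
    def cls_get_column_names_no_id(cls):
        return ['name', 'email']

    @classmethod
    def cls_get_lookup_fields(cls):
        return ['email']

# Builds a CachedDimension for dw.customer keyed on id, cached in full and
# looked up by email.
dim = cls_to_pygram_dim(Customer, 'dw')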
def load_originated_legs(self):
    self.row2['uuid'] = self.uuid
    self.row2['CallUuid'] = self.call_uuid
    self.row2['MemberUuid'] = self.member_uuid
    self.row2['IsTransferLeg'] = self.is_transfer_leg
    self.row2['Direction'] = self.direction
    self.row2['ObjType'] = self.op_cat
    self.row2['originated_leg'] = self.originatedLegsList

    originated_legs_dimension = Dimension(
        name='"DimOriginatedLegs"',
        key='uuid',
        attributes=['CallUuid', 'Direction', 'MemberUuid', 'originated_leg',
                    'IsTransferLeg', 'ObjType'])

    originated_legs_dimension_temp = Dimension(
        name='"DimOriginatedLegsTemp"',
        key='uuid',
        attributes=['CallUuid', 'Direction', 'MemberUuid', 'originated_leg',
                    'IsTransferLeg', 'ObjType'])

    self.row2['uuid'] = originated_legs_dimension.ensure(self.row2)
    self.row2['uuid'] = originated_legs_dimension_temp.ensure(self.row2)
def convert_FullDate(row):
    # Convert FullDate to datetime
    return datetime.strptime(row['FullDate'], '%d/%m/%Y')


def key_finder(row, namemapping):
    # Assign the primary key of DimDate from the (possibly renamed) FullDate;
    # .get() keeps this working when no namemapping is passed
    return row[namemapping.get('FullDate', 'FullDate')]


# Create dimension and fact table abstractions for use in the ETL flow
DimDate = Dimension(
    name='DimDate',
    key='DateKey',
    attributes=['FullDate', 'DateName', 'DayOfWeek', 'DayNameOfWeek',
                'DayOfMonth', 'DayOfYear', 'WeekdayWeekend', 'WeekOfYear',
                'MonthName', 'MonthOfYear', 'IsLastDayOfMonth',
                'CalendarQuarter', 'CalendarYear', 'CalendarYearMonth',
                'CalendarYearQtr', 'FiscalMonthOfYear', 'FiscalQuarter',
                'FiscalYear', 'FiscalYearMonth', 'FiscalYearQtr'],
    lookupatts=['FullDate'],
    idfinder=key_finder)

DimGroup = TypeOneSlowlyChangingDimension(
    name='DimGroup',
    key='GroupKey',
    attributes=['GroupId', 'GroupName', 'Version'],
    lookupatts=['GroupId'])

DimGroupCategory = Dimension(
    name='DimGroupCategory',
    key='GroupCategoryKey',
    attributes=['GroupCategoryId', 'GroupCategoryName'])
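# With idfinder=key_finder, DimDate.ensure() derives DateKey from the row's
# FullDate value instead of drawing from an auto-generated key sequence.
# A minimal illustration of the two helpers (the row values are made up):
row = {'FullDate': '25/12/2023'}
print(convert_FullDate(row))   # -> datetime.datetime(2023, 12, 25, 0, 0)
print(key_finder(row, {}))     # -> '25/12/2023', used directly as DateKey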
# Connections
dw_conn = sqlite3.connect(dw_path)
author_conn = sqlite3.connect(author_path)
book_conn = sqlite3.connect(book_path)
country_handle = open(country_path, "r")
wrapper = pygrametl.ConnectionWrapper(dw_conn)

# Sources
author_src = SQLSource(connection=author_conn, query="SELECT * FROM author")
book_src = SQLSource(connection=book_conn, query="SELECT * FROM book")
country_src = CSVSource(f=country_handle, delimiter=",")

# Tables
author_dim = Dimension(name="authordim", key="aid",
                       attributes=["name", "city", "cid"])
book_dim = SlowlyChangingDimension(name="bookdim", key="bid",
                                   attributes=["title", "year", "version"],
                                   lookupatts=["title"],
                                   versionatt="version")
country_dim = Dimension(name="countrydim", key="cid",
                        attributes=["country"], lookupatts=["country"])
fact_table = FactTable(name="facttable", keyrefs=["aid", "bid"])
snowflake = SnowflakedDimension([(author_dim, country_dim)])

# We map cities to countries and populate the countrydim
cid_map = {}
for row in country_src:
    cid = country_dim.ensure(row)
    cid_map[row["city"]] = cid  # assumes the CSV also provides a city column
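# The flow is cut off above; under the usual pygrametl pattern it would next
# resolve each author's country key through cid_map and ensure the snowflaked
# dimension. A hedged continuation sketch (not the source's actual code):
for row in author_src:
    row["cid"] = cid_map.get(row["city"])  # resolve the country key by city
    snowflake.ensure(row)                  # fills authordim and countrydim

for row in book_src:
    book_dim.scdensure(row)                # versioned insert into the SCD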
""" A sample pygrametl program """ __author__ = 'Mathias Claus Jensen' import pygrametl from pygrametl.datasources import SQLSource from pygrametl.tables import Dimension, FactTable import sqlite3 input_conn = sqlite3.connect('input.db') output_conn = sqlite3.connect('output.db') input_src = SQLSource(input_conn, query='SELECT * dim1') output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn) dim1 = Dimension(name='dim1', key='key1', attributes=['attr1', 'attr2']) dim1 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4']) ft1 = FactTable(name='ft1', keyrefs=[ 'key1', ]) input_conn.close() output_conn.close()
                SALARY REAL);''')

# Rows are duplicated on purpose so the duplicate predicate below has
# something to detect.
company_info = [('Anders', 43, 'Denmark', 21000.00),
                ('Charles', 50, 'Texas', 25000.00),
                ('Wolf', 28, 'Sweden', 19000.00),
                ('Hannibal', 45, 'America', 65000.00),
                ('Hannibal', 45, 'America', 65000.00),
                ('Buggy', 67, 'America', 2000),
                ('Buggy', 67, 'America', 2000),
                ('Buggy', 67, 'America', 2000),
                ('Buggy', 67, 'America', 2000),
                ('Buggy', 67, 'America', 2000),
                ('Buggy', 67, 'America', 2000)]

# ... and inserting the necessary data.
c.executemany("INSERT INTO COMPANY (NAME,AGE,ADDRESS,SALARY) "
              "VALUES (?,?,?,?)", company_info)
conn.commit()

columns = ['NAME', 'AGE', 'ADDRESS', 'SALARY']
wrapper = ConnectionWrapper(conn)
dim = Dimension(name='COMPANY',
                key='ID',
                attributes=['NAME', 'AGE', 'ADDRESS', 'SALARY'],
                lookupatts=['NAME'])
dim_rep = DimRepresentation(dim, conn)
dw = DWRepresentation([dim_rep], conn)
dup_predicate = DuplicatePredicate('company', ['ID'], True)
print(dup_predicate.run(dw))
conn.close()
def setup(dbname, number):
    # Truncate/create the database file before connecting; close the handle
    # instead of leaking it
    open(os.path.expanduser(dbname), 'w').close()
    output_conn = sqlite3.connect(dbname)
    output_cur = output_conn.cursor()
    output_cur.execute("CREATE TABLE dim1 "
                       "(key1 INTEGER PRIMARY KEY, attr1 INTEGER, "
                       "attr2 INTEGER)")
    output_cur.execute("CREATE TABLE dim2 "
                       "(key2 INTEGER PRIMARY KEY, attr3 INTEGER, "
                       "attr4 INTEGER)")
    output_cur.execute("CREATE TABLE ft1 "
                       "(measure INTEGER, key1 INTEGER, key2 INTEGER, "
                       "PRIMARY KEY(key1, key2))")
    output_conn.commit()

    output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    dim1 = Dimension(
        name='dim1',
        key='key1',
        attributes=['attr1', 'attr2']
    )
    dim2 = Dimension(
        name='dim2',
        key='key2',
        attributes=['attr3', 'attr4'],
    )
    ft1 = FactTable(
        name='ft1',
        keyrefs=['key1', 'key2'],
        measures=['measure']
    )

    facts = []
    dim1_data = []
    dim2_data = []
    total_elapsed = 0.

    print('Generating ft1 data')
    start = time.monotonic()
    counter = 0
    for i in range(1, number + 1):
        for j in range(1, number + 1):
            counter += 10
            facts.append({'key1': i, 'key2': j, 'measure': counter})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim1 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim1_data.append({'attr1': i, 'attr2': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim2 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim2_data.append({'attr3': i, 'attr4': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into ft1')
    start = time.monotonic()
    for row in facts:
        ft1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim1')
    start = time.monotonic()
    for row in dim1_data:
        dim1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim2')
    start = time.monotonic()
    for row in dim2_data:
        dim2.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('DW populated')
    print('Total time: {}{}\n'.format(round(total_elapsed, 3), 's'))
    output_conn.commit()
region_source = CSVSource(f=csv_file_handle, delimiter=',')

# We create an object for each dimension in the DW and for the FactTable
book_dimension = Dimension(
    name='bookDim',
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='timeDim',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='locationDim',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='factTable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])

# NOTE: Most of the following code is taken directly from pygrametl.org and
# has little to do with making this example portable to SQLite; it is,
# however, where the actual ETL work happens.

# Python function needed to split the timestamp into its three parts

# We put the regions into our location_dimension
sales_conn = sqlite3.connect(SALES_DB_NAME)
dw_conn = sqlite3.connect(DW_NAME)

# Wrapper for pygrametl, so that it is now in charge of our DW connection
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=dw_conn)

# We make a source for the SQL database and one for the CSV file.
# NOTE: names=NAME_MAPPING renames the query's result columns so they match
# the attribute names the dimension and fact table objects below expect.
sales_source = SQLSource(connection=sales_conn,
                         query="SELECT * FROM sales",
                         names=NAME_MAPPING)

csv_file_handle = open(CSV_NAME, "r")
region_source = CSVSource(f=csv_file_handle, delimiter=',')

# We create an object for each dimension in the DW and for the FactTable
book_dimension = Dimension(
    name='bookDim',
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='timeDim',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='locationDim',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='factTable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])
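# SQLSource's names argument simply assigns new names to the query's result
# columns in order, so NAME_MAPPING only needs to list the sales table's
# columns under the names used above. A hedged sketch of what it might
# contain (the exact column list is an assumption, chosen to match the
# tables above):
NAME_MAPPING = ('book', 'genre', 'city', 'timestamp', 'sale')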
    {'attr1': None},
    {'attr1': 15},
    {'attr1': None},
    {'attr1': None},
    {'attr1': 1},
    {'attr1': 25},
    {'attr1': None},
    {'attr1': 5},
    {'attr1': None},
    {'attr1': None}
]

wrapper = pygrametl.ConnectionWrapper(connection=null_conn)
dim1 = Dimension(
    name='dim1',
    key='key1',
    attributes=['attr1'],
)

for row in data:
    dim1.insert(row)

dim_rep = DimRepresentation(dim1, null_conn)
notnull_tester = ColumnNotNullPredicate('dim1')
null_rep = DWRepresentation([dim_rep], null_conn)
print(notnull_tester.run(null_rep))
null_conn.close()
# handle to the file, as pygrametl uses Python's csv.DictReader for CSV
# files, and the header of the CSV file contains information about each
# column.
region_file_handle = open('c:\\work\\python\\region.csv', 'r', 16384)
region_source = CSVSource(f=region_file_handle, delimiter=',')

# An instance of Dimension is created for each dimension in the data
# warehouse. For each table we give the name of the table, the primary key
# of the table, and a list of non-key attributes in the table. In addition,
# for the location dimension we specify which attributes should be used when
# looking up the primary key, as the sales database only contains the city,
# which is enough to identify a unique member. As mentioned in the beginning
# of the guide, using named parameters is strongly encouraged.
book_dimension = Dimension(
    name='book',
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='time',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='location',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

# A single instance of FactTable is created for the data warehouse's
# fact table, with the name of the table, a list of attributes constituting
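# Because location_dimension declares lookupatts=['city'], its surrogate key
# can later be fetched from a city name alone. A minimal illustration (the
# city value is made up):
locationid = location_dimension.lookup({'city': 'Aalborg'})
# lookup() returns the key of the matching member, or None if no member
# exists yet; ensure() would insert the row in that case instead.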
# Making a table to test on...
c.execute('''CREATE TABLE FACTTABLE
             (Book TEXT NOT NULL,
              Issue INT)''')

company_info = [('The Hobbit', 1),
                ('The Bobbit', 5)]

# ... and inserting the necessary data.
c.executemany("INSERT INTO FACTTABLE (BOOK,ISSUE) VALUES (?,?)",
              company_info)
conn.commit()

ConnectionWrapper(conn)
dim = Dimension('COMPANY', 'ID', ['NAME', 'AGE', 'ADDRESS', 'SALARY'],
                ['NAME'])
dim_rep = DimRepresentation(dim, conn)
dim2 = Dimension('LASTNAME', 'NAME', ['LAST'])
dim_rep2 = DimRepresentation(dim2, conn)
ft = FactTable('FACTTABLE', ['Book'], ['Issue'])
ft_rep = FTRepresentation(ft, conn)
dw = DWRepresentation([dim_rep, ft_rep, dim_rep2], conn)

expected_list1 = [
    {'NAME': 'Anders', 'AGE': 43, 'SALARY': 21000.0,
     'ADDRESS': 'Denmark', 'ID': 1},
    {'NAME': 'Charles', 'AGE': 50, 'SALARY': 25000.0,
     'ADDRESS': 'Texas', 'ID': 2},
}, {
    'attr1': 75,
    'attr2': 75,
    'attr3': 75,
    'attr4': 75
}, {
    'attr1': 76,
    'attr2': 76,
    'attr3': 76,
    'attr4': 76
}]

wrapper = pygrametl.ConnectionWrapper(connection=conn)

dim1 = Dimension(name='dim1',
                 key='key1',
                 attributes=['attr1', 'key2', 'key3'],
                 lookupatts=['attr1'])
dim2 = Dimension(name='dim2',
                 key='key2',
                 attributes=['attr2', 'key4'],
                 lookupatts=['attr2'])
dim3 = Dimension(name='dim3',
                 key='key3',
                 attributes=['attr3'])
dim4 = Dimension(name='dim4',
                 key='key4',
                 attributes=['attr4'])

special_snowflake = SnowflakedDimension(references=[(dim1, [dim2, dim3]),
                                                    (dim2, dim4)])
def __create_tables(self):
    # Systems
    self.system_dimension = CachedDimension(
        name='system',
        key='system_id',
        attributes=['system_name'],
        lookupatts=['system_name'])

    # Stations
    self.start_station_dimension = CachedDimension(
        name='start_station',
        key='start_station_id',
        attributes=[
            'system_id', 'start_station_short_name', 'start_station_name',
            'start_station_latitude', 'start_station_longitude',
            'start_station_capacity'
        ],
        lookupatts=['system_id', 'start_station_short_name'],
        rowexpander=start_station_missing_data_expander)

    self.end_station_dimension = CachedDimension(
        name='end_station',
        key='end_station_id',
        attributes=[
            'system_id', 'end_station_short_name', 'end_station_name',
            'end_station_latitude', 'end_station_longitude',
            'end_station_capacity'
        ],
        lookupatts=['system_id', 'end_station_short_name'],
        rowexpander=end_station_missing_data_expander)

    # Trip dates and times
    self.date_dimension = CachedDimension(
        name='bdate',
        key='date_id',
        attributes=['year', 'month', 'day', 'day_of_week', 'date_string'],
        lookupatts=['date_string'],
        rowexpander=date_row_expander)

    self.time_dimension = CachedDimension(
        name='btime',
        key='time_id',
        attributes=['hour', 'minute', 'time_string', 'time_of_day'],
        lookupatts=['time_string'],
        rowexpander=time_row_expander)

    # Trips
    self.trip_fact_table = FactTable(
        name='trips',
        measures=['duration_s'],
        keyrefs=[
            'system_id', 'start_station_id', 'end_station_id', 'date_id',
            'time_id', 'customer_birthyear_id', 'customer_gender_id',
            'customer_type_id', 'bike_id', 'trip_category_id'
        ])

    # Weather fact table and date dimension
    self.weather_fact_table = FactTable(
        name='weather',
        measures=[
            'precipitation_in', 'snow_in', 'temp_avg_f', 'temp_min_f',
            'temp_max_f', 'wind_mph'
        ],
        keyrefs=['system_id', 'date_id'])

    self.trip_category = CachedDimension(
        name='trip_category',
        key='trip_category_id',
        attributes=['trip_category'])

    self.bike_dimension = CachedDimension(
        name='bikes',
        key='bike_id',
        attributes=['system_id', 'bike_name'],
        lookupatts=['system_id', 'bike_name'],
        defaultidvalue=-1)

    self.customer_gender_dimension = CachedDimension(
        name='customer_gender',
        key='customer_gender_id',
        attributes=['customer_gender'],
        lookupatts=['customer_gender'])

    self.customer_birthyear_dimension = CachedDimension(
        name='customer_birthyear',
        key='customer_birthyear_id',
        attributes=['customer_birthyear'],
        lookupatts=['customer_birthyear'])

    self.customer_type_dimension = CachedDimension(
        name='customer_type',
        key='customer_type_id',
        attributes=['customer_type'],
        lookupatts=['customer_type'])

    # Station status
    self.station_status_fact_table = FactTable(
        name='station_status',
        keyrefs=['system_id', 'start_station_id', 'date_id', 'time_id'],
        measures=['bikes_available', 'docks_available'])

    # Non-cached versions of the station dimensions, used only when updating
    # Indego stations.
    self.start_station_noncached_dimension = Dimension(
        name='start_station',
        key='start_station_id',
        attributes=[
            'system_id', 'start_station_short_name', 'start_station_name',
            'start_station_latitude', 'start_station_longitude',
            'start_station_capacity'
        ],
        lookupatts=['system_id', 'start_station_short_name'],
        rowexpander=start_station_missing_data_expander,
        defaultidvalue=-1)

    self.end_station_noncached_dimension = Dimension(
        name='end_station',
        key='end_station_id',
        attributes=[
            'system_id', 'end_station_short_name', 'end_station_name',
            'end_station_latitude', 'end_station_longitude',
            'end_station_capacity'
        ],
        lookupatts=['system_id', 'end_station_short_name'],
        rowexpander=end_station_missing_data_expander,
        defaultidvalue=-1)
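# Several dimensions above rely on rowexpander callbacks
# (start_station_missing_data_expander, date_row_expander, ...) defined
# elsewhere in the class. In pygrametl a rowexpander is a
# function(row, namemapping) -> row that ensure() invokes before inserting a
# new member, which makes it a natural place to derive attributes a source
# row lacks. A hedged sketch of what date_row_expander might look like (the
# derivation details are assumptions, not the source's code):
from datetime import datetime

def date_row_expander(row, namemapping):
    # Derive the bdate attributes from date_string before insertion;
    # the '%Y-%m-%d' format is an assumption.
    parsed = datetime.strptime(row['date_string'], '%Y-%m-%d')
    row['year'] = parsed.year
    row['month'] = parsed.month
    row['day'] = parsed.day
    row['day_of_week'] = parsed.strftime('%A')
    return row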