def load_ticket_events(self):
    ticket_event_dimension = Dimension(
        name='"DimTicketEvent"',
        key='ticket_event_id',
        attributes=['event_type', 'event_author', 'event_body'])

    ticket_event_fact_table = FactTable(
        name='"FactTicketEvent"',
        keyrefs=['ticket_id', 'ticket_event_id'],
        measures=['event_created_at_dim_id', 'event_created_at'])

    # Update the row with the primary key of the dimension while at the
    # same time inserting new data into the dimension
    self.row['ticket_event_id'] = ticket_event_dimension.ensure(self.row)
    ticket_event_fact_table.insert(self.row)
def load_cdr(self):
    call_dimension = Dimension(
        name='"DimCall"',
        key='uuid',
        attributes=[
            'SipFromUser', 'SipToUser', 'HangupCause', 'Direction',
            'SwitchName', 'CallerContext', 'IsAnswered', 'Duration',
            'ObjClass', 'ObjType', 'ObjCategory', 'CompanyId', 'TenantId',
            'AppId', 'AgentSkill', 'OriginatedLegs', 'DVPCallDirection',
            'HangupDisposition', 'AgentAnswered', 'IsQueued', 'SipResource',
            'BusinessUnit', 'CampaignId', 'CampaignName', 'ExtraData'
        ])

    call_fact_table = FactTable(
        name='"FactCall"',
        keyrefs=[
            'uuid', 'CallUuid', 'BridgeUuid', 'CreatedDateDimId',
            'AnsweredDateDimId', 'BridgedDateDimId', 'HangupDateDimId',
            'CreatedTimeDimId', 'AnsweredTimeDimId', 'BridgedTimeDimId',
            'HangupTimeDimId'
        ],
        measures=[
            'CreatedTime', 'AnsweredTime', 'BridgedTime', 'HangupTime',
            'BillSec', 'HoldSec', 'ProgressSec', 'QueueSec', 'AnswerSec',
            'WaitSec', 'ProgressMediaSec', 'FlowBillSec'
        ])

    self.row['uuid'] = call_dimension.ensure(self.row)
    call_fact_table.insert(self.row)
def load_tickets(self):
    ticket_dimension = Dimension(
        name='"DimTicket"',
        key='ticket_id',
        attributes=[
            'active', 'tid', 'is_sub_ticket', 'subject', 'reference',
            'description', 'priority', 'requester', 'submitter',
            'collaborators', 'watchers', 'attachments', 'sub_tickets',
            'related_tickets', 'merged_tickets', 'isolated_tags', 'sla',
            'comments', 'tags', 'ticket_type', 'status_type', 'channel',
            'company', 'tenant', 'assignee', 'assignee_group',
            'security_level', 'event_author'
        ])

    ticket_fact_table = FactTable(
        name='"FactTicket"',
        keyrefs=[
            'engagement_session_id', 'ticket_id', 'created_at_dim_id',
            'updated_at_dim_id', 'due_at_dim_id', 'created_time_dim_id',
            'updated_time_dim_id', 'due_time_dim_id'
        ],
        measures=[
            'created_at', 'updated_at', 'due_at', 'time_estimation',
            're_opens', 'waited_time', 'worked_time', 'resolution_time'
        ])

    # Update the row with the primary key of the dimension while at the
    # same time inserting new data into the dimension
    self.row['ticket_id'] = ticket_dimension.ensure(self.row)
    ticket_fact_table.insert(self.row)
def load_originated_legs(self):
    self.row2['uuid'] = self.uuid
    self.row2['CallUuid'] = self.call_uuid
    self.row2['MemberUuid'] = self.member_uuid
    self.row2['IsTransferLeg'] = self.is_transfer_leg
    self.row2['Direction'] = self.direction
    self.row2['ObjType'] = self.op_cat
    self.row2['originated_leg'] = self.originatedLegsList

    originated_legs_dimension = Dimension(
        name='"DimOriginatedLegs"',
        key='uuid',
        attributes=['CallUuid', 'Direction', 'MemberUuid',
                    'originated_leg', 'IsTransferLeg', 'ObjType'])

    originated_legs_dimension_temp = Dimension(
        name='"DimOriginatedLegsTemp"',
        key='uuid',
        attributes=['CallUuid', 'Direction', 'MemberUuid',
                    'originated_leg', 'IsTransferLeg', 'ObjType'])

    # Ensure the row in both DimOriginatedLegs and its Temp counterpart
    self.row2['uuid'] = originated_legs_dimension.ensure(self.row2)
    self.row2['uuid'] = originated_legs_dimension_temp.ensure(self.row2)
def load_engagement(self):
    engagement_dimension = Dimension(
        name='"DimEngagement"',
        key='engagement_id',
        attributes=['profile', 'channel_from', 'channel_to', 'company',
                    'has_profile', 'tenant', 'notes', 'channel'])

    engagement_fact_table = FactTable(
        name='"FactEngagement"',
        keyrefs=['engagement_id', 'created_at_dim_id', 'updated_at_dim_id',
                 'created_time_dim_id', 'updated_time_dim_id'],
        measures=['updated_at', 'created_at'])

    # Update the row with the primary key of the dimension, which the
    # fact table references through its engagement_id keyref
    self.row['engagement_id'] = engagement_dimension.ensure(self.row)
    engagement_fact_table.insert(self.row)
    return True
def load_external_users(self):
    external_users_dimension = Dimension(
        name='"DimExternalUsers"',
        key='external_user_id',
        attributes=['title', 'name', 'avatar', 'birthday', 'gender',
                    'first_name', 'last_name', 'locale', 'ssn', 'password',
                    'phone', 'email', 'landnumber', 'facebook', 'twitter',
                    'linkedin', 'googleplus', 'skype', 'thirdpartyreference',
                    'company', 'tenant', 'custom_fields', 'tags', 'contacts',
                    'zipcode', 'address_number', 'city', 'province',
                    'country'])

    external_users_fact_table = FactTable(
        name='"FactExternalUsers"',
        keyrefs=['external_user_id', 'created_at_dim_id',
                 'updated_at_dim_id', 'created_time_dim_id',
                 'updated_time_dim_id'],
        measures=['created_at', 'updated_at'])

    self.row['external_user_id'] = external_users_dimension.ensure(self.row)
    external_users_fact_table.insert(self.row)
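# The loader methods above assume that a default pygrametl target
# connection already exists; Dimension and FactTable use it implicitly
# when no targetconnection is given. A minimal sketch of that setup,
# with hypothetical connection details (the quoted table names suggest
# PostgreSQL, so psycopg2 is assumed here):
import psycopg2
import pygrametl

pgconn = psycopg2.connect(host='localhost', dbname='dw',
                          user='etl', password='etl')

# The first ConnectionWrapper created becomes pygrametl's default target,
# so the ensure() and insert() calls above can find it without being
# passed a connection explicitly
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=pgconn)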
# As all the information needed for the other dimensions is stored in the
# sales database, we can loop through all of its rows, split the timestamp
# into its three parts, and look up the three dimension keys needed for
# the fact table while letting pygrametl update each dimension with the
# necessary data using Dimension.ensure(). Using this method instead of
# insert() combines a lookup with an insertion, so a new row is only
# inserted into the dimension or fact table if it does not yet exist.
for row in sales_source:

    # The timestamp is split into its three parts
    split_timestamp(row)

    # We update the row with the primary keys of each dimension while at
    # the same time inserting new data into each dimension
    row['bookid'] = book_dimension.ensure(row)
    row['timeid'] = time_dimension.ensure(row)

    # We do not use ensure() for the location dimension, as the city
    # and region information of all stores has already been loaded into
    # the table, and having a sale from a store we do not know about is
    # probably an error in either the sales or the region data. We use
    # lookup() instead, which does not insert data and returns None if no
    # row with the requested data is available, allowing for a simple
    # implementation of error handling in the ETL flow, which is shown
    # here as an exception
    row['locationid'] = location_dimension.lookup(row)

    # A simple example of how to check if a lookup was successful, so
    # errors can be handled in some way. Here we just give up and raise
    # an error
    if not row['locationid']:
        raise ValueError("city was not present in the location dimension")
Fact_WorkAttendance = AccumulatingSnapshotFactTable(
    name='Fact_WorkAttendance',
    keyrefs=['CandidateKey', 'GroupKey', 'DateKey'],
    otherrefs=['Version', 'TimeIn', 'TimeOut'],
    measures=['TotalHoursWorked'])

Fact_CandidateConversation = AccumulatingSnapshotFactTable(
    name='Fact_CandidateConversation',
    keyrefs=['CandidateKey', 'ConversationDateKey', 'ConversationId'],
    otherrefs=['Version', 'ConversationRole'],
    measures=['Rating'])

# Load the dimension tables
for row in DimDate_source:
    row['FullDate'] = convert_FullDate(row)
    DimDate.ensure(row)

for row in DimGroup_source:
    row['Version'] = CS.VersionNumber
    DimGroup.scdensure(row)

for row in DimGroupCategory_source:
    DimGroupCategory.ensure(row)

for row in DimRole_source:
    DimRole.ensure(row)

for row in DimCandidate_source:
    row['Version'] = CS.VersionNumber
    DimCandidate.scdensure(row)
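# A minimal sketch of how the accumulating fact tables defined above
# might be loaded; work_attendance_source and its column names are
# assumptions. ensure() inserts a fact the first time its keyrefs are
# seen and afterwards updates any otherrefs and measures that have
# changed, e.g. TimeOut and TotalHoursWorked once a shift has ended
for row in work_attendance_source:
    Fact_WorkAttendance.ensure(row)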
author_dim = Dimension(
    name="authordim",
    key="aid",
    attributes=["name", "city", "cid"])

book_dim = SlowlyChangingDimension(
    name="bookdim",
    key="bid",
    attributes=["title", "year", "version"],
    lookupatts=["title"],
    versionatt="version")

country_dim = Dimension(
    name="countrydim",
    key="cid",
    attributes=["country"],
    lookupatts=["country"])

fact_table = FactTable(
    name="facttable",
    keyrefs=["aid", "bid"])

snowflake = SnowflakedDimension([(author_dim, country_dim)])

# We map cities to countries and populate the countrydim
cid_map = {}
for row in country_src:
    cid = country_dim.ensure(row)
    cid_map[row["city"]] = cid

# We populate the authordim and the fact table
for row in author_src:
    if row["city"] in ["Hadsten", "Skanderborg", "Kobenhavn"]:
        row["cid"] = cid_map[row["city"]]
    else:
        row["cid"] = None
    row["name"] = row["firstname"] + " " + row["lastname"]
    row.pop("aid", 0)  # Gets rid of aid so that pygrametl can generate it

    # Placing new row in author_dim
    row["aid"] = author_dim.ensure(row)

    # Placing new row in fact_table
    fact_table.insert(row)
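# A sketch of how the SlowlyChangingDimension bookdim defined above might
# be maintained; book_src is an assumed source of rows with a title and
# year. scdensure() returns the key of the matching version, or creates a
# new version of the book when its tracked attributes have changed
for row in book_src:
    row["bid"] = book_dim.scdensure(row)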
row['month'] = timestamp_split[1]
row['day'] = timestamp_split[2]

# We put the regions into our location_dimension
[location_dimension.insert(row) for row in region_source]
csv_file_handle.close()

# Each row in the sales database is iterated through and inserted
for row in sales_source:

    # Each row is passed to the timestamp split function for splitting
    split_timestamp(row)

    # Lookups are performed to find the key in each dimension for the fact,
    # and if the data is not there, it is inserted from the sales row
    row['bookid'] = book_dimension.ensure(row)
    row['timeid'] = time_dimension.ensure(row)

    # For the location dimension, all the data is already present, so a
    # missing row must be an error
    row['locationid'] = location_dimension.lookup(row)
    if not row['locationid']:
        raise ValueError("city was not present in the location dimension")

    # The row can then be inserted into the fact table
    fact_table.insert(row)

# The data warehouse connection is then ordered to commit and close
dw_conn_wrapper.commit()
dw_conn_wrapper.close()