for row in DimBadge_source:
    row['Version'] = CS.VersionNumber
    DimBadge.scdensure(row)

for row in DimLocation_source:
    DimLocation.scdensure(row)

for row in DimLevel_source:
    DimLevel.ensure(row)

# Load the fact tables
for row in Fact_CandidateGroup_source:
    row['CandidateKey'] = DimCandidate.lookup(row)
    row['GroupKey'] = DimGroup.lookup(row)
    row['GroupCategoryKey'] = DimGroupCategory.lookup(row)
    row['RoleKey'] = DimRole.lookup(row)
    row['StartDate'] = row['StartDate'].date()
    row['ExpiryDate'] = row['ExpiryDate'].date()
    row['StartDateKey'] = DimDate.lookup(row, namemapping={'FullDate': 'StartDate'})
    row['ExpiryDateKey'] = DimDate.lookup(
        row, namemapping={'FullDate': 'ExpiryDate'})
    row['Version'] = CS.VersionNumber
    Fact_CandidateGroup.ensure(row)

for row in Fact_CandidateMeetingAttendance_source:
    row['CandidateKey'] = DimCandidate.lookup(row)
    row['MeetingTypeKey'] = DimMeetingType.lookup(row)
    row['MeetingDateKey'] = DimDate.lookup(
        row, namemapping={'FullDate': 'MeetingDate'})
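The listing above uses pygrametl table objects (DimBadge, DimLocation, DimLevel, DimDate, the Fact_* tables, and so on) that must have been declared before the loading starts. As a hedged sketch of what such declarations could look like, the following uses pygrametl's SlowlyChangingDimension, CachedDimension, and FactTable classes; the attribute lists and column names are illustrative assumptions rather than the actual schema, and a ConnectionWrapper to the target data warehouse is assumed to have been created beforehand.

from pygrametl.tables import CachedDimension, FactTable, SlowlyChangingDimension

# A type-2 slowly changing dimension; scdensure() inserts a new version of a
# row whenever one of the tracked attributes has changed. The column names
# here (City, Region, ScdVersion, ValidFrom, ValidTo) are assumptions.
DimLocation = SlowlyChangingDimension(
    name='DimLocation', key='LocationKey',
    attributes=['City', 'Region', 'ScdVersion', 'ValidFrom', 'ValidTo'],
    lookupatts=['City'],
    versionatt='ScdVersion', fromatt='ValidFrom', toatt='ValidTo')

# A plain cached dimension; ensure() looks a row up and inserts it if missing
DimLevel = CachedDimension(
    name='DimLevel', key='LevelKey',
    attributes=['LevelName'], lookupatts=['LevelName'])

# The fact table; ensure() only inserts a fact if it is not already present
Fact_CandidateGroup = FactTable(
    name='Fact_CandidateGroup',
    keyrefs=['CandidateKey', 'GroupKey', 'GroupCategoryKey', 'RoleKey',
             'StartDateKey', 'ExpiryDateKey'],
    measures=['StartDate', 'ExpiryDate', 'Version'])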
    # The timestamp is split into its three parts
    split_timestamp(row)

    # We update the row with the primary keys of each dimension while at
    # the same time inserting new data into each dimension
    row['bookid'] = book_dimension.ensure(row)
    row['timeid'] = time_dimension.ensure(row)

    # We do not use ensure() for the location dimension, as the city
    # and region information of all stores has already been loaded into
    # the table, and having a sale from a store we do not know about is
    # probably an error in either the sales or the region data. We use
    # lookup() instead, which does not insert data and returns None if no
    # row with the requested data is available, allowing for a simple
    # implementation of error handling in the ETL flow, shown here as an
    # exception
    row['locationid'] = location_dimension.lookup(row)

    # A simple example of how to check if a lookup was successful, so
    # errors can be handled in some way. Here we just give up and raise
    # an error.
    if not row['locationid']:
        raise ValueError("City was not present in the location dimension")

    # As the number of sales was already conveniently aggregated in the
    # sales table, the row can now be inserted into the data warehouse as
    # we have all the IDs we need. If aggregations or other more advanced
    # manipulation is required, the full power of Python is available, as
    # shown with the call to the split_timestamp(row) function.
    fact_table.insert(row)

# After all the data is inserted, we close the connection in order to
# ensure that the inserted data is committed to the data warehouse
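The split_timestamp(row) helper called above is not shown in this listing. A minimal sketch of such a function, assuming each source row carries a datetime value in a 'timestamp' field, could look like this:

def split_timestamp(row):
    """Split the 'timestamp' field of a row into day, month, and year."""
    # Assumes 'timestamp' is a datetime.datetime or datetime.date;
    # adjust the parsing if the source delivers strings instead
    timestamp = row['timestamp']
    row['year'] = timestamp.year
    row['month'] = timestamp.month
    row['day'] = timestamp.day

Assuming the time dimension is declared with day, month, and year attributes, the three new fields then match it directly, so time_dimension.ensure(row) can use them without any name mapping.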
# We put the regions into our location_dimension
[location_dimension.insert(row) for row in region_source]
csv_file_handle.close()

# Each row in the sales database is iterated through and inserted
for row in sales_source:
    # Each row is passed to the timestamp split function for splitting
    split_timestamp(row)

    # Lookups are performed to find the key in each dimension for the fact,
    # and if the data is not there, it is inserted from the sales row
    row['bookid'] = book_dimension.ensure(row)
    row['timeid'] = time_dimension.ensure(row)

    # For the location dimension, all the data is already present, so a
    # missing row must be an error
    row['locationid'] = location_dimension.lookup(row)
    if not row['locationid']:
        raise ValueError("City was not present in the location dimension")

    # The row can then be inserted into the fact table
    fact_table.insert(row)

# The data warehouse connection is then ordered to commit and close
dw_conn_wrapper.commit()
dw_conn_wrapper.close()
sales_conn.close()
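This listing relies on the connections, data sources, dimensions, and fact table having been set up before the loading starts. A possible setup is sketched below; the SQLite connections, file names, queries, and column lists are placeholders chosen for illustration, not the document's actual configuration.

import sqlite3
import pygrametl
from pygrametl.datasources import CSVSource, SQLSource
from pygrametl.tables import CachedDimension, FactTable

# The sales database is read through a plain PEP 249 connection, while the
# data warehouse connection is wrapped so pygrametl can manage it; the
# wrapper becomes the default target connection for the table objects below
sales_conn = sqlite3.connect('sales.db')
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=sqlite3.connect('dw.db'))

# Sources: sales rows come from SQL, regions from a CSV file
sales_source = SQLSource(connection=sales_conn,
                         query="SELECT book, genre, store, timestamp, sale FROM sales")
csv_file_handle = open('region.csv', 'r')
region_source = CSVSource(csv_file_handle, delimiter=',')

# Dimension and fact table objects; the attribute lists are assumptions
book_dimension = CachedDimension(name='book', key='bookid',
                                 attributes=['book', 'genre'],
                                 lookupatts=['book'])
time_dimension = CachedDimension(name='time', key='timeid',
                                 attributes=['day', 'month', 'year'])
location_dimension = CachedDimension(name='location', key='locationid',
                                     attributes=['city', 'region'],
                                     lookupatts=['city'])
fact_table = FactTable(name='facttable',
                       keyrefs=['bookid', 'locationid', 'timeid'],
                       measures=['sale'])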