Example #1
for row in DimBadge_source:
    row['Version'] = CS.VersionNumber
    DimBadge.scdensure(row)

for row in DimLocation_source:
    DimLocation.scdensure(row)

for row in DimLevel_source:
    DimLevel.ensure(row)
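
# The dimension objects above are not defined in this snippet. Under
# pygrametl, DimBadge and DimLocation are presumably SlowlyChangingDimension
# instances, since scdensure() maintains versioned (type-2) members, while
# DimLevel.ensure() only inserts a member if it is not already present.
# A hedged sketch, in which every table, key, and attribute name is an
# assumption for illustration:
#
#     from pygrametl.tables import CachedDimension, SlowlyChangingDimension
#
#     DimBadge = SlowlyChangingDimension(
#         name='DimBadge', key='BadgeKey',
#         attributes=['BadgeName', 'ValidFrom', 'ValidTo', 'Version'],
#         lookupatts=['BadgeName'],
#         fromatt='ValidFrom', toatt='ValidTo', versionatt='Version')
#     DimLevel = CachedDimension(
#         name='DimLevel', key='LevelKey', attributes=['LevelName'])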

# Load the fact tables
for row in Fact_CandidateGroup_source:
    row['CandidateKey'] = DimCandidate.lookup(row)
    row['GroupKey'] = DimGroup.lookup(row)
    row['GroupCategoryKey'] = DimGroupCategory.lookup(row)
    row['RoleKey'] = DimRole.lookup(row)
    row['StartDate'] = row['StartDate'].date()
    row['ExpiryDate'] = row['ExpiryDate'].date()
    row['StartDateKey'] = DimDate.lookup(
        row, namemapping={'FullDate': 'StartDate'})
    row['ExpiryDateKey'] = DimDate.lookup(
        row, namemapping={'FullDate': 'ExpiryDate'})
    row['Version'] = CS.VersionNumber
    Fact_CandidateGroup.ensure(row)
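
# Note on the date lookups above: namemapping={'FullDate': 'StartDate'}
# tells lookup() to read the value for the dimension's 'FullDate' lookup
# attribute from the row's 'StartDate' field, so a single date dimension
# can serve both the start and expiry dates. A sketch of the date
# dimension this assumes (all names are guesses):
#
#     DimDate = CachedDimension(
#         name='DimDate', key='DateKey', attributes=['FullDate'])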

for row in Fact_CandidateMeetingAttendance_source:
    row['CandidateKey'] = DimCandidate.lookup(row)
    row['MeetingTypeKey'] = DimMeetingType.lookup(row)
    row['MeetingDateKey'] = DimDate.lookup(
        row, namemapping={'FullDate': 'MeetingDate'})
    row['Version'] = CS.VersionNumber
    Fact_CandidateMeetingAttendance.ensure(row)

Example #2

for row in sales_source:

    # The timestamp is split into its three parts by a helper function
    # (a sketch of split_timestamp is given after this example)
    split_timestamp(row)

    # We update the row with the primary keys of each dimension while at
    # the same time inserting new data into each dimension
    row['bookid'] = book_dimension.ensure(row)
    row['timeid'] = time_dimension.ensure(row)

    # We do not use ensure() for the location dimension, as the city
    # and region information for all stores has already been loaded into
    # the table, and a sale from a store we do not know about is probably
    # an error in the sales or region data. We use lookup() instead, which
    # does not insert data and returns None if no row with the requested
    # data is available, so error handling is simple to implement in the
    # ETL flow; here it is shown as a raised exception
    row['locationid'] = location_dimension.lookup(row)

    # A simple example of how to check whether a lookup was successful,
    # so errors can be handled. Here we just give up and raise an error.
    if row['locationid'] is None:
        raise ValueError("City was not present in the location dimension")

    # As the number of sales was already conveniently aggregated in the
    # sales table, the row can now be inserted into the data warehouse as
    # we have all the IDs we need. If aggregation or other more advanced
    # manipulation is required, the full power of Python is available, as
    # shown by the call to the split_timestamp(row) function.
    fact_table.insert(row)

# After all the data is inserted, we commit and close the connection in
# order to make sure the data is persisted in the data warehouse
dw_conn_wrapper.commit()
dw_conn_wrapper.close()
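
# The split_timestamp() helper used in Examples #2 and #3 is not defined
# in these snippets. A minimal sketch of what it presumably does, assuming
# each row carries a datetime value in a 'timestamp' field (the field
# names are assumptions):
def split_timestamp(row):
    """Split a timestamp into its day, month, and year parts."""
    timestamp = row['timestamp']
    row['year'] = timestamp.year
    row['month'] = timestamp.month
    row['day'] = timestamp.day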

Example #3
# We put the regions into our location_dimension
for row in region_source:
    location_dimension.insert(row)
csv_file_handle.close()
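# (region_source presumably reads from csv_file_handle, which is why the
# handle can be closed once the regions are loaded; see the setup sketch
# after this example)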

# Each row from the sales database is processed and inserted in turn
for row in sales_source:

    # Each row is passed to the timestamp split function
    split_timestamp(row)

    # Lookups are performed to find the key in each dimension for the fact,
    # and if the data is not there, it is inserted from the sales row
    row['bookid'] = book_dimension.ensure(row)
    row['timeid'] = time_dimension.ensure(row)

    # For the location dimension, all the data is already present, so a
    # missing row must be an error
    row['locationid'] = location_dimension.lookup(row)
    if row['locationid'] is None:
        raise ValueError("city was not present in the location dimension")

    # The row can then be inserted into the fact table
    fact_table.insert(row)

# The data warehouse connection is then committed and closed
dw_conn_wrapper.commit()
dw_conn_wrapper.close()

sales_conn.close()
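
# None of the setup for Examples #2 and #3 is shown in these snippets.
# Under pygrametl, the sources, connection wrapper, dimensions, and fact
# table would likely be created along the following lines; the connection
# details, file name, query, and all table and column names are
# assumptions for illustration, not the original code:
import sqlite3

import pygrametl
from pygrametl.datasources import CSVSource, SQLSource
from pygrametl.tables import CachedDimension, FactTable

sales_conn = sqlite3.connect('sales.db')  # assumed source database
dw_conn = sqlite3.connect('dw.db')        # assumed target database

# Wrap the target connection; pygrametl uses the first ConnectionWrapper
# created as the default target for the table objects below
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=dw_conn)

# Sources for the region file and the sales table (names are assumed)
csv_file_handle = open('region.csv', 'r', 16384)
region_source = CSVSource(csv_file_handle, delimiter=',')
sales_source = SQLSource(connection=sales_conn,
                         query='SELECT book, genre, city, timestamp, sale '
                               'FROM sales')

# Dimension and fact table objects matching the columns the examples use
book_dimension = CachedDimension(
    name='book', key='bookid', attributes=['book', 'genre'])
time_dimension = CachedDimension(
    name='time', key='timeid', attributes=['day', 'month', 'year'])
location_dimension = CachedDimension(
    name='location', key='locationid', attributes=['city', 'region'],
    lookupatts=['city'])
fact_table = FactTable(
    name='facttable', keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])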