# Finally, each part is assigned back to the row dictionary. The caller can
# then access the parts, as the row is a reference to the same dict object
row['year'] = timestamp_split[0]
row['month'] = timestamp_split[1]
row['day'] = timestamp_split[2]

# The location dimension is filled with data from the CSV file, as the file
# contains information for both columns in the table. If the dimension were
# filled from the sales database instead, the region attribute would have to
# be updated with data from the CSV file later anyway.

# To perform the insertion, Dimension.insert() is used, which inserts a row
# into the table, and the connection wrapper is asked to commit so that the
# data is present in the database and the keys can be looked up when the
# fact table is loaded
[location_dimension.insert(row) for row in region_source]

# The file handle for the CSV file can then be closed
region_file_handle.close()

# As all the information needed for the other dimensions is stored in the
# sales database, we can loop through all of its rows, split the timestamp
# into its three parts, and look up the three dimension keys needed for the
# fact table while letting pygrametl update each dimension with the
# necessary data using Dimension.ensure(). This method combines a lookup
# with an insertion, so a new row is only inserted into the dimension or
# fact table if it does not already exist.
for row in sales_source:
    # The timestamp is split into its three parts
    split_timestamp(row)
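    # NOTE: A hedged sketch of how this loop presumably continues, based on
    # the comment above: Dimension.ensure() combines lookup and insertion and
    # returns the dimension key, which is stored in the row so the fact
    # table's foreign keys are in place. The names book_dimension,
    # time_dimension, location_dimension, and fact_table are assumed from the
    # surrounding example, not taken from this part of the source.
    row['bookid'] = book_dimension.ensure(row)
    row['timeid'] = time_dimension.ensure(row)
    row['locationid'] = location_dimension.ensure(row)

    # With all three dimension keys in place, the fact row can be inserted
    fact_table.insert(row)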
# Imports required by this snippet
import os
import sqlite3
import time

import pygrametl
from pygrametl.tables import Dimension, FactTable


def setup(dbname, number):
    # Create or truncate the database file so each run starts from an empty DW
    dbfile = os.path.expanduser(dbname)
    open(dbfile, 'w').close()
    output_conn = sqlite3.connect(dbfile)
    output_cur = output_conn.cursor()
    output_cur.execute("CREATE TABLE dim1 "
                       "(key1 INTEGER PRIMARY KEY, attr1 INTEGER, "
                       "attr2 INTEGER)")
    output_cur.execute("CREATE TABLE dim2 "
                       "(key2 INTEGER PRIMARY KEY, attr3 INTEGER, "
                       "attr4 INTEGER)")
    output_cur.execute("CREATE TABLE ft1 "
                       "(measure INTEGER, key1 INTEGER, key2 INTEGER, "
                       "PRIMARY KEY(key1, key2))")
    output_conn.commit()

    # The ConnectionWrapper becomes the default target connection used by the
    # Dimension and FactTable instances below
    output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    dim1 = Dimension(
        name='dim1',
        key='key1',
        attributes=['attr1', 'attr2'])

    dim2 = Dimension(
        name='dim2',
        key='key2',
        attributes=['attr3', 'attr4'])

    ft1 = FactTable(
        name='ft1',
        keyrefs=['key1', 'key2'],
        measures=['measure'])

    facts = []
    dim1_data = []
    dim2_data = []
    total_elapsed = 0.0

    print('Generating ft1 data')
    start = time.monotonic()
    counter = 0
    for i in range(1, number + 1):
        for j in range(1, number + 1):
            counter += 10
            facts.append({'key1': i, 'key2': j, 'measure': counter})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim1 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim1_data.append({'attr1': i, 'attr2': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Generating dim2 data')
    start = time.monotonic()
    for i in range(1, number + 1):
        dim2_data.append({'attr3': i, 'attr4': number + 1 - i})
    end = time.monotonic()
    elapsed = end - start
    print('Generated: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into ft1')
    start = time.monotonic()
    for row in facts:
        ft1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim1')
    start = time.monotonic()
    for row in dim1_data:
        dim1.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('Inserting data into dim2')
    start = time.monotonic()
    for row in dim2_data:
        dim2.insert(row)
    end = time.monotonic()
    elapsed = end - start
    print('Inserted: {}{}'.format(round(elapsed, 3), 's'))
    total_elapsed += elapsed

    print('DW populated')
    print('Total time: {}{}\n'.format(round(total_elapsed, 3), 's'))
    output_conn.commit()
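# A minimal usage sketch (not part of the original benchmark); the file name
# and size are arbitrary examples. Note that the number of generated fact
# rows grows quadratically with `number`, so number=1000 already yields
# 1,000,000 facts.
if __name__ == '__main__':
    setup('benchmark_dw.db', 100)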
# Test data for dim1; attr1 deliberately contains NULL values
data = [
    {'attr1': None}, {'attr1': 15}, {'attr1': None}, {'attr1': None},
    {'attr1': 1}, {'attr1': 25}, {'attr1': None}, {'attr1': 5},
    {'attr1': None}, {'attr1': None}
]

# null_conn is assumed to be the database connection created earlier; the
# ConnectionWrapper becomes the default target connection used by dim1
wrapper = pygrametl.ConnectionWrapper(connection=null_conn)

dim1 = Dimension(
    name='dim1',
    key='key1',
    attributes=['attr1'])

for row in data:
    dim1.insert(row)

# Wrap the dimension in a representation of the data warehouse and run the
# predicate, which checks the columns of dim1 for NULL values
dim_rep = DimRepresentation(dim1, null_conn)
notnull_tester = ColumnNotNullPredicate('dim1')
null_rep = DWRepresentation([dim_rep], null_conn)
print(notnull_tester.run(null_rep))
null_conn.close()
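# To make the intent of ColumnNotNullPredicate concrete, here is a small
# self-contained sketch of the same kind of check using only sqlite3: count
# the NULLs in a column and treat any hit as a failure. This illustrates the
# idea only and is not the predicate's actual implementation.
import sqlite3

check_conn = sqlite3.connect(':memory:')
check_conn.execute("CREATE TABLE dim1 "
                   "(key1 INTEGER PRIMARY KEY, attr1 INTEGER)")
check_conn.executemany("INSERT INTO dim1(attr1) VALUES (?)",
                       [(None,), (15,), (None,), (1,), (25,)])
nulls = check_conn.execute(
    "SELECT COUNT(*) FROM dim1 WHERE attr1 IS NULL").fetchone()[0]
print(nulls == 0)  # False: attr1 contains NULLs, so a NOT NULL check fails
check_conn.close()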
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='factTable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])

# NOTE: Most of the following code is taken directly from pygrametl.org and
# has little to do with making this example portable to SQLite or with this
# particular implementation. It is, however, down here that the cool stuff
# happens, i.e. the actual ETL.

# Python function needed to split the timestamp into its three parts

# We put the regions into our location_dimension
[location_dimension.insert(row) for row in region_source]
csv_file_handle.close()

# Here are all of our steps. They perform the transformations necessary to
# load our data into the DW. Each row from the source is processed through
# the step chain one at a time.

# Beginning of the chain, which iterates over the source rows
step_starter = SourceStep(sales_source)

# Calls the split_timestamp function on each row
time_splitter = Step(split_timestamp)

# These steps ensure that an entry matching the row exists in the given
# dimension, inserting it if it does not. Afterwards, the primary key of
# that entry is added to the row.
ensure_book = DimensionStep(dimension=book_dimension, keyfield='bookid')
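# Presumably the chain continues with analogous steps for the two remaining
# dimensions. The sketch below simply mirrors the ensure_book step above,
# using the keyfields declared in the fact table's keyrefs; the names
# time_dimension and location_dimension are assumed from the rest of the
# example, and wiring the steps together and inserting the facts is omitted
# here.
ensure_time = DimensionStep(dimension=time_dimension, keyfield='timeid')
ensure_location = DimensionStep(dimension=location_dimension,
                                keyfield='locationid')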