def setup_scd(dbname, number): open(os.path.expanduser(dbname), 'w') output_conn = sqlite3.connect(dbname) output_cur = output_conn.cursor() output_cur.execute("CREATE TABLE scd " "(key INTEGER PRIMARY KEY, attr1 INTEGER, " "attr2 INTEGER, version INTEGER)") output_conn.commit() output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn) scd = SlowlyChangingDimension( name='scd', key='key', attributes=['attr1', 'attr2', 'version'], lookupatts=['attr2'], versionatt='version' ) data = [] total_elapsed = 0. print('Generating scd data') start = time.monotonic() for i in range(1, number + 1): data.append({'attr1': i, 'attr2': number + 1 - i}) end = time.monotonic() elapsed = end - start print('Generated: {}{}'.format(round(elapsed, 3), 's')) total_elapsed += elapsed print('Inserting data into scd') start = time.monotonic() for row in data: scd.scdensure(row) end = time.monotonic() elapsed = end - start print('Inserted: {}{}'.format(round(elapsed, 3), 's')) total_elapsed += elapsed print('DW populated') print('Total time: {}{}\n'.format(round(total_elapsed, 3), 's')) output_conn.commit()
AGE INT NOT NULL, ADDRESS CHAR(50), VERSION INT);''') company_info = [('Anders', 43, 'Denmark', 1.0), ('Charles', 50, 'Texas', 1), ('Wolf', 28, 'Sweden', 1), ('Hannibal', 45, 'America', 1), ('Anders', 43, 'Denmark', 2.0)] # ... and inserting the necessary data. c.executemany( "INSERT INTO COMPANY (NAME,AGE,ADDRESS,VERSION) VALUES (?,?,?,?)", company_info) conn.commit() ConnectionWrapper(conn) s = SlowlyChangingDimension('COMPANY', 'ID', ['NAME', 'AGE', 'ADDRESS', 'VERSION'], ['NAME', 'AGE'], 'VERSION') dim = SCDType2DimRepresentation(s, conn) dw = DWRepresentation([dim], conn) a = SCDVersionPredicate('COMPANY', {'NAME': 'Anders', 'AGE': 43}, 2) b = SCDVersionPredicate('COMPANY', {'NAME': 'Anders', 'AGE': 43}, 3) print(a.run(dw)) print(b.run(dw)) conn.close()
book_conn = sqlite3.connect(book_path) country_handle = open(country_path, "r") wrapper = pygrametl.ConnectionWrapper(dw_conn) # Sources author_src = SQLSource(connection=author_conn, query="SELECT * FROM author") book_src = SQLSource(connection=book_conn, query="SELECT * FROM book") country_src = CSVSource(f=country_handle, delimiter=",") # Tables author_dim = Dimension(name="authordim", key="aid", attributes=["name", "city", "cid"]) book_dim = SlowlyChangingDimension( name="bookdim", key="bid", attributes=["title", "year", "version"], lookupatts=["title"], versionatt="version" ) country_dim = Dimension(name="countrydim", key="cid", attributes=["country"], lookupatts=["country"]) fact_table = FactTable(name="facttable", keyrefs=["aid", "bid"]) snowflake = SnowflakedDimension([(author_dim, country_dim)]) # We map cities to countries and populate the countrydim cid_map = {} for row in country_src: cid = country_dim.ensure(row) cid_map[row["city"]] = cid # We populate the authordim and the fact table
row['is_fall_semester'] = month < 2 or month >= 6 row['is_holiday'] = datetime.date(year, month, day) in dk_holidays row['t_timestamp'] = timestamp return row ### Dimensions and Facts ### product_dimension = SlowlyChangingDimension(name="dim.product", key="product_id", attributes=[ "name", "price", "alcohol_content_ml", "activate_date", "deactivate_date", "version", "valid_from", "valid_to" ], lookupatts=["name"], versionatt="version", fromatt="valid_from", toatt="valid_to", srcdateatt="lastmoddate", cachesize=-1) time_dimension = CachedDimension( name='dim.time', key='time_id', attributes=[ 't_year', 't_month', 't_day', 't_hour', 'day_of_week', 'is_fall_semester', 'is_holiday', 't_timestamp' ],
serverdim = CachedDimension(name='server', key='serverid', attributes=['server']) serverversiondim = CachedDimension(name='serverversion', key='serverversionid', attributes=['serverversion', 'serverid']) pagedim = SlowlyChangingDimension( name='page', key='pageid', attributes=[ 'url', 'size', 'validfrom', 'validto', 'version', 'domainid', 'serverversionid' ], lookupatts=['url'], versionatt='version', fromatt='validfrom', toatt='validto', #fromfinder=pygrametl.datereader('lastmoddate'),# API updated srcdateatt='lastmoddate', # this is new cachesize=-1) pagesf = SnowflakedDimension([(pagedim, (serverversiondim, domaindim)), (serverversiondim, serverdim), (domaindim, topleveldim)]) testdim = CachedDimension(name='test', key='testid', attributes=['testname', 'testauthor'], lookupatts=['testname'],
lookupatts=['domain']) serverdim = CachedDimension(name='server', key='serverid', attributes=['server']) serverversiondim = CachedDimension(name='serverversion', key='serverversionid', attributes=['serverversion', 'serverid']) pagedim = SlowlyChangingDimension(name='page', key='pageid', attributes=[ 'url', 'size', 'validfrom', 'validto', 'version', 'domainid', 'serverversionid' ], lookupatts=['url'], versionatt='version', fromatt='validfrom', toatt='validto', srcdateatt='lastmoddate', cachesize=-1) pagesf = SnowflakedDimension([(pagedim, (serverversiondim, domaindim)), (serverversiondim, serverdim), (domaindim, topleveldim)]) testdim = CachedDimension(name='test', key='testid', attributes=['testname', 'testauthor'], lookupatts=['testname'], prefill=True,