        'attr2': 74, 'attr3': 74, 'attr4': 74},
        {'attr1': 75, 'attr2': 75, 'attr3': 75, 'attr4': 75},
        {'attr1': 76, 'attr2': 76, 'attr3': 76, 'attr4': 76}]

wrapper = pygrametl.ConnectionWrapper(connection=conn)

dim1 = Dimension(name='dim1', key='key1',
                 attributes=['attr1', 'key2', 'key3'], lookupatts=['attr1'])
dim2 = Dimension(name='dim2', key='key2',
                 attributes=['attr2', 'key4'], lookupatts=['attr2'])
dim3 = Dimension(name='dim3', key='key3', attributes=['attr3'])
dim4 = Dimension(name='dim4', key='key4', attributes=['attr4'])
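
# A hedged sketch, not from the original: the key2/key3/key4 attributes above
# suggest a snowflake schema in which dim1 references dim2 and dim3, and dim2
# references dim4. pygrametl's SnowflakedDimension (from pygrametl.tables) can
# then fill all levels from one source row. The name 'data' for the row list
# is an assumption, as the list's name is cut off in the source:
snowflake = SnowflakedDimension(
    references=[(dim1, [dim2, dim3]), (dim2, dim4)])

for row in data:
    snowflake.ensure(row)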
import sys
dir = sys.path[0]  # path to lib
sys.path.append(dir + '\\lib')

import pygrametl
import MySQLdb
from pygrametl.datasources import *
from pygrametl.tables import *

mysql_conn_target = MySQLdb.connect(host='localhost', user='******',
                                    passwd='', db='dwh')
mysql_conn_source = MySQLdb.connect(host='localhost', user='******',
                                    passwd='', db='karty')
mysql_conn_source.set_character_set('utf8')
mysql_conn_target.set_character_set('utf8')

conn_source = pygrametl.ConnectionWrapper(mysql_conn_source)
conn_target = pygrametl.ConnectionWrapper(mysql_conn_target)

query_target = 'TRUNCATE TABLE facilitytype'
conn_target.execute(query_target)
conn_target.commit()

# data source for the dimension
query_source = 'SELECT id, name, description FROM facilitytype'
facilitytype_source = SQLSource(connection=conn_source, query=query_source,
                                names=(), initsql=None, cursorarg=None)

# the dimension
facility_dim = CachedDimension(
    targetconnection=conn_target,
    name='facilitytype',
    key='id',
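
# A hedged completion sketch, not from the original: the CachedDimension call
# above is cut off in the source. Based on the SELECT id, name, description
# query, a plausible completion and fill loop would be:
facility_dim = CachedDimension(
    targetconnection=conn_target,
    name='facilitytype',
    key='id',
    attributes=['name', 'description'],  # assumed from the source query
    lookupatts=['name'])                 # assumed lookup attribute

for row in facilitytype_source:
    facility_dim.ensure(row)
conn_target.commit()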
DB_NAME = 'csi4142'
DB_USER = '******'
DB_HOST = 'localhost'
DB_PASS = ''

# Global variables used for in-memory stores.
POP_DATA = {}
LIFE_EXPECTANCY_DATA = {}
GNI_DATA = {}
NUTRITION_DATA = {}

# Connection to the target data warehouse:
pgconn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, host=DB_HOST,
                          password=DB_PASS)
connection = pygrametl.ConnectionWrapper(pgconn)
connection.setasdefault()
connection.execute('set search_path to csi4142project')


# Methods
def pgcopybulkloader(name, atts, fieldsep, rowsep, nullval, filehandle):
    # Here we use driver-specific code to get fast bulk loading.
    # You can change this method if you use another driver, or you can
    # use the FactTable or BatchFactTable classes (which don't require
    # driver-specific code) instead of the BulkFactTable class.
    global connection
    curs = connection.cursor()
    curs.copy_from(file=filehandle, table=name, sep=fieldsep,
                   null=str(nullval), columns=atts)
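
# A hedged sketch, not from the original: wiring the loader above into a
# BulkFactTable (from pygrametl.tables), as the comment suggests. The table
# name, keyrefs, and measures below are assumptions:
facts = BulkFactTable(
    name='facts',                          # hypothetical fact table name
    keyrefs=['country_key', 'year_key'],   # hypothetical dimension references
    measures=['population', 'gni'],        # hypothetical measures
    bulkloader=pgcopybulkloader,
    bulksize=500000)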
        'attr1': None}, {'attr1': 1}, {'attr1': 25}, {'attr1': None},
        {'attr1': 5}, {'attr1': None}, {'attr1': None}]

wrapper = pygrametl.ConnectionWrapper(connection=null_conn)

dim1 = Dimension(
    name='dim1',
    key='key1',
    attributes=['attr1'],
)

for row in data:
    dim1.insert(row)

dim_rep = DimRepresentation(dim1, null_conn)
notnull_tester = ColumnNotNullPredicate('dim1')
null_rep = DWRepresentation([dim_rep], null_conn)
print(notnull_tester.run(null_rep))
from pygrametl.tables import Dimension, FactTable

# Creation of a database connection to the sales database with a simple
# connection string, specifying the necessary host, username and password
sales_string = "host='localhost' dbname='source' user='******' password='******' port=54320"
sales_pgconn = psycopg2.connect(sales_string)

# A connection is also created for the data warehouse. The connection is
# then given to a ConnectionWrapper so it is implicitly shared between
# all the pygrametl abstractions that need it, without being passed around
dw_string = "host='localhost' dbname='etl' user='******' password='******' port=54320"
dw_pgconn = psycopg2.connect(dw_string)

# Although the ConnectionWrapper is shared automatically between pygrametl
# abstractions, we still save it in a variable so it can be closed later
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=dw_pgconn)

# As the location dimension stores the name of a location in the attribute
# "city" instead of in the attribute "store" as in the input data from the
# sales relation, a sequence of names matching the number of attributes in
# the relation is created, allowing the SQLSource to do the mapping for us
name_mapping = 'book', 'genre', 'city', 'timestamp', 'sale'

# Extraction of rows from a database using a PEP 249 connection and SQL
sales_source = SQLSource(connection=sales_pgconn,
                         query="SELECT * FROM sales", names=name_mapping)

# Extraction of rows from a CSV file does not require SQL, just an open file
# handle to the file, as pygrametl uses Python's DictReader for CSV files,
# and the header of the CSV file contains information about each column
region_file_handle = open('c:\\work\\python\\region.csv', 'r', 16384)
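
# The likely next step (a sketch, assuming CSVSource is imported from
# pygrametl.datasources): wrap the open file handle in a CSVSource, which
# reads the rows with Python's csv.DictReader:
region_source = CSVSource(f=region_file_handle, delimiter=',')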
# setup connection to database
config = configparser.ConfigParser()
config.read('/opt/projects/wob_zz/config.ini')
login = {
    'user': config.get('local_mssql', 'user'),
    'password': config.get('local_mssql', 'password'),
    'server': config.get('local_mssql', 'server'),
    'port': config.get('local_mssql', 'port'),
    'database': config.get('wob_zz', 'database')
}
cnx = sql.connect(**login)
cur = cnx.cursor()
connection = etl.ConnectionWrapper(cnx)
connection.setasdefault()

# define dimension object for ETL
# Note that:
# - pygrametl object table names are DIM_xxx, FCT_yyy
# - MS SQL schema.table names are DIM.xxx, FCT.yyy
DIM_AFSLUITREDEN = CachedDimension(
    name='DIM.AFSLUITREDEN',
    key='afs_id',
    attributes=['afs_afsluitreden_code'],
    size=0,
    prefill=True
)
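
# A hedged usage sketch, not from the original: with prefill=True the cache is
# loaded up front, and ensure() inserts a row only when its lookup attributes
# are not already present, returning the surrogate key either way ('OVERIG'
# is an assumed example value):
afs_id = DIM_AFSLUITREDEN.ensure({'afs_afsluitreden_code': 'OVERIG'})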
""" A sample pygrametl program """ __author__ = 'Mathias Claus Jensen' import pygrametl from pygrametl.datasources import SQLSource from pygrametl.tables import Dimension, FactTable import sqlite3 input_conn = sqlite3.connect('input.db') output_conn = sqlite3.connect('output.db') input_src = SQLSource(input_conn, query='SELECT * dim1') output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn) dim1 = Dimension(name='dim1', key='key1', attributes=['attr1', 'attr2']) dim1 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4']) ft1 = FactTable(name='ft1', keyrefs=[ 'key1', ]) input_conn.close() output_conn.close()
import pyodbc
import pygrametl
import ConnectionStrings as CS
import SourceSQLQueries as SSQ
from datetime import datetime
from pygrametl.datasources import TypedCSVSource, SQLSource
from pygrametl.tables import Dimension, TypeOneSlowlyChangingDimension, \
    FactTable, AccumulatingSnapshotFactTable

# Open a connection to the OLTP AntBil
AntBilReplication_conn = pyodbc.connect(CS.AntBilReplication_string)

# Open a connection to the DW AntBil and create a ConnectionWrapper
AntBilDW_conn = pyodbc.connect(CS.AntBilDW_string)
AntBilDW_conn_wrapper = pygrametl.ConnectionWrapper(AntBilDW_conn)
AntBilDW_conn_wrapper.setasdefault()

# Create the data source of each dimension table
attribute_mapping = {'DateKey': int, 'DayOfWeek': int, 'DayOfMonth': int,
                     'DayOfYear': int, 'WeekOfYear': int, 'MonthOfYear': int,
                     'CalendarQuarter': int, 'CalendarYear': int,
                     'FiscalMonthOfYear': int, 'FiscalQuarter': int,
                     'FiscalYear': int}
DimDate_source = TypedCSVSource(f=open('DimDate_2017-2037.csv', 'r', 16384),
                                casts=attribute_mapping, delimiter=',')
DimGroup_source = SQLSource(connection=AntBilReplication_conn,
                            query=SSQ.DimGroup_query)
DimGroupCategory_source = SQLSource(connection=AntBilReplication_conn,
                                    query=SSQ.DimGroupCategory_query)
DimRole_source = SQLSource(connection=AntBilReplication_conn,
                           query=SSQ.DimRole_query)
DimCandidate_source = SQLSource(connection=AntBilReplication_conn,
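
# A hedged sketch, not from the original: the date dimension could then be
# loaded directly, since the CSV already supplies the DateKey surrogate key.
# The Dimension object below is an assumption:
DimDate = Dimension(
    name='DimDate',
    key='DateKey',
    attributes=['DayOfWeek', 'DayOfMonth', 'DayOfYear', 'WeekOfYear',
                'MonthOfYear', 'CalendarQuarter', 'CalendarYear',
                'FiscalMonthOfYear', 'FiscalQuarter', 'FiscalYear'])

for row in DimDate_source:
    DimDate.insert(row)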
# -*- coding: utf-8 -*-
import datetime
import sys
import time

sys.path.append('/home/Documents')  # where pygrametl is located
import pygrametl
from pygrametl.datasources import CSVSource, MergeJoiningSource
from pygrametl.tables import CachedDimension, SnowflakedDimension, BulkFactTable

# Connection to target DW:
import MySQLdb
myconn = MySQLdb.connect(user='******', passwd='hola', db='Estadisticas')
connection = pygrametl.ConnectionWrapper(myconn)
connection.setasdefault()


def loader(name, atts, fieldsep, rowsep, nullval, filehandle):
    # Note: copy_from is a psycopg2 method; a MySQLdb cursor does not
    # provide it (see the alternative sketch below).
    curs = myconn.cursor()  # was MySQLConnection.cursor(), an undefined name
    curs.copy_from(file=filehandle, table=name, sep=fieldsep,
                   null=str(nullval), columns=atts)


# database
sgbstdn = CachedDimension(
    name='SGBSTDN',
    key='matricula',
    attributes=['nombre', 'paterno', 'materno', 'degc_code', 'class_code'],
    lookupatts=['matricula']
)
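
# A hedged alternative sketch, not from the original: since MySQLdb has no
# copy_from, a MySQL bulk loader would typically use LOAD DATA LOCAL INFILE.
# This assumes the connection is created with local_infile=1 and that the
# BulkFactTable is given usefilename=True, so pygrametl passes a file name
# instead of an open handle:
def mysql_loader(name, atts, fieldsep, rowsep, nullval, filename):
    curs = myconn.cursor()
    curs.execute(
        "LOAD DATA LOCAL INFILE %s INTO TABLE " + name +
        " FIELDS TERMINATED BY %s LINES TERMINATED BY %s (" +
        ", ".join(atts) + ")",
        (filename, fieldsep, rowsep))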
def load_dimensions(output_conn):
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)
    ret = dict()
    ret['dim_datetime'] = CachedDimension(
        name='dim_datetime',
        key='datetime_id',
        attributes=['epoch', 'minute', 'minute_20', 'minute_30', 'hour',
                    'day_of_week', 'day_of_month', 'week', 'month', 'year',
                    'period'],
        lookupatts=['epoch'],
        size=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)
    ret['dim_location'] = TypeOneSlowlyChangingDimension(
        name='dim_location',
        key='location_id',
        attributes=['lookup_location', 'initial_id', 'company_code',
                    'street', 'ward', 'district', 'city', 'area', 'country',
                    'level1flag', 'level2flag', 'level3flag', 'level4flag',
                    'level5flag', 'level6flag'],
        lookupatts=['lookup_location'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)
    ret['dim_employee'] = TypeOneSlowlyChangingDimension(
        name='dim_employee',
        key='employee_id',
        attributes=['lookup_employee', 'initial_id', 'company_code', 'login',
                    'name', 'active', 'mobile', 'email'],
        lookupatts=['lookup_employee'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)
    ret['dim_partner'] = TypeOneSlowlyChangingDimension(
        name='dim_partner',
        key='partner_id',
        attributes=['lookup_partner', 'initial_id', 'company_code', 'name',
                    'ref', 'is_company', 'active', 'customer', 'supplier',
                    'employee', 'state', 'seq', 'seq_order', 'street_id',
                    'classify', 'total_sh'],
        lookupatts=['lookup_partner'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)
    ret['dim_company'] = TypeOneSlowlyChangingDimension(
        name='dim_company',
        key='company_id',
        attributes=['company_code', 'company_name'],
        lookupatts=['company_code'],
        cachesize=0,
        prefill=True,
        targetconnection=dw_conn_wrapper)
    return ret
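
# A hypothetical usage sketch: the returned dictionary gives direct access to
# each dimension object, e.g. (attribute values below are assumed):
dimensions = load_dimensions(output_conn)
company_key = dimensions['dim_company'].scdensure(
    {'company_code': 'C001', 'company_name': 'Example Co.'})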
def run_fact_etl(fact_name, class_name, pygram_fact_factory, source_sql,
                 source_conn, output_conn, create_sql, dimensions={}):
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))

    # create connection to dw
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    # TODO: add try statement to raise error
    # create fact table object
    pygram_fact_class = pygram_fact_factory["class"]
    pygram_fact_object = pygram_fact_class(
        name=pygram_fact_factory["name"],
        measures=pygram_fact_factory["measures"],
        keyrefs=pygram_fact_factory["keyrefs"],
        targetconnection=dw_conn_wrapper)

    # create fact table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(fact_name))
    print('create {} if not exist'.format(fact_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for each item of the primary key group
    logger.info('create index of {} if not exist'.format(fact_name))
    print('create index of {} if not exist'.format(fact_name))
    for keyref in pygram_fact_factory['keyrefs']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                          ON {}({})'''.format(fact_name, keyref,
                                              fact_name, keyref))
        output_conn.commit()

    # create data_source
    logger.info('start query {}'.format(fact_name))
    print('start query {}'.format(fact_name))
    data_source = SQLSource(connection=source_conn, query=source_sql)

    # handle fact
    final_source = transform_handle(class_name, fact_name, data_source)

    # ensure each row into the fact table
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    if length_source == 0:
        logger.info('no record in query period')
        print('no record in query period')
    else:
        count = 1
        for row in list_data_source:
            row = add_foreign_keys(row, pygram_fact_factory["keyrefs"],
                                   dimensions)
            # log the primary key and measure values of the row
            dict_keyref = {}
            for keyref in pygram_fact_factory['keyrefs']:
                dict_keyref[keyref] = row[keyref]
            for measure in pygram_fact_factory['measures']:
                dict_keyref[measure] = row[measure]
            logger.debug('row {}:{}'.format(count, dict_keyref))
            # The row can then be inserted into the fact table
            pygram_fact_object.ensure(row)
            progress(count, length_source, status='{}'.format(fact_name))
            count += 1
    print('done')
    output_conn.commit()
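
# A hedged sketch of the factory dict run_fact_etl expects; every name below
# is an assumption, and FactTable could be replaced by BatchFactTable without
# changing the function:
sales_fact_factory = {
    'class': FactTable,
    'name': 'fct_sales',
    'keyrefs': ['datetime_id', 'company_id', 'partner_id'],
    'measures': ['amount'],
}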
def run_dimension_etl(dimension_name, class_name, pygram_dimension_factory,
                      source_sql, source_conn, output_conn, create_sql):
    """This function can be used in any kind of workflow (for example in a
    celery task) or in a simple main program.
    """
    # TODO: add null user to employee dimension
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))

    # connection wrapper
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    # create dimension table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(dimension_name))
    print('create {} if not exist'.format(dimension_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for the dimension
    logger.info('create index of {} if not exist'.format(dimension_name))
    print('create index of {} if not exist'.format(dimension_name))
    for lookupatt in pygram_dimension_factory['lookupatts']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                          ON {}({})'''.format(dimension_name, lookupatt,
                                              dimension_name, lookupatt))
        output_conn.commit()

    # create dimension object
    pygram_dim_class = pygram_dimension_factory["class"]
    pygram_dim_object = pygram_dim_class(
        name=pygram_dimension_factory["name"],
        key=pygram_dimension_factory["key"],
        attributes=pygram_dimension_factory["attributes"],
        lookupatts=pygram_dimension_factory["lookupatts"],
        targetconnection=dw_conn_wrapper,
        cachesize=0,
        prefill=True)

    # TODO: handle datetime dimension here
    # create data_source
    logger.info('start query {}'.format(dimension_name))
    print('start query {}'.format(dimension_name))
    if dimension_name in ['dim_datetime', 'dim_company', 'dim_call_center',
                          'dim_dong_ho_o', 'dim_dong_ho_tong',
                          'dim_hoa_don_tai_chinh']:
        final_source = source_sql
    else:
        data_source = SQLSource(connection=source_conn, query=source_sql)
        final_source = transform_handle(class_name, dimension_name,
                                        data_source)

    # ensure each row into the dimension
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    count = 1
    for row in list_data_source:
        pygram_dim_object.scdensure(row)
        progress(count, length_source, status='{}'.format(dimension_name))
        count += 1
    print('done')
    output_conn.commit()
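
# A matching hedged sketch for run_dimension_etl: because the function calls
# scdensure(), the factory class must support it, e.g.
# TypeOneSlowlyChangingDimension. All names below are assumptions:
company_dimension_factory = {
    'class': TypeOneSlowlyChangingDimension,
    'name': 'dim_company',
    'key': 'company_id',
    'attributes': ['company_code', 'company_name'],
    'lookupatts': ['company_code'],
}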
def run(self):
    self.file_name = True
    self.configure()
    if not self.file_name:
        return
    self.configureAudit()

    if self.verbose:
        print('Running job:', self.__class__.__name__)
        # print('Configuration:', self.conf)
        print('Arguments:', self.args)

    try:
        self.beforeJobAudit()
        counts = {
            'extract': 0,
            'insert': 0,
            'update': 0,
            'error': 0,
        }

        # Wrap the connection to use by pygrametl
        self.target_connection_wrap = pygrametl.ConnectionWrapper(
            connection=self.target_connection)

        source = self.getSource()
        target = self.getTarget()
        for row in source:
            # Catch any incompatible data warnings
            counts['extract'] += 1
            try:
                prepared = self.prepareRow(row)
                if self.verbose:
                    print('Inserting row:', prepared)
                self.insertRow(target, prepared)
                is_insert = True  # where to get this info?
                if is_insert:
                    counts['insert'] += 1
                else:
                    counts['update'] += 1
                if self.debug:
                    # Commit in debug mode so we can see inserted rows
                    self.target_connection_wrap.commit()
                    input('Row inserted successfully. '
                          'Press Enter to continue...')
            except Exception as e:
                counts['error'] += 1
                # Should we log something here?
                print(e)
                self.logWarning(row, counts['extract'])
                if self.verbose or self.debug:
                    print('Row could not be inserted due to an error.', row)
                if self.debug:
                    input('Press Enter to continue...')

        self.afterJobAudit(counts)
        if self.verbose:
            print("Commit the target database")
        self.target_connection.commit()
        self.target_connection.close()
    except Exception as e:
        self.logError(e)

    self.close()
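
# A minimal hypothetical subclass sketch: the base class name ETLJob and all
# table and column names are assumptions, used only to illustrate the hooks
# that run() calls (getSource, getTarget, prepareRow, insertRow):
class CustomerJob(ETLJob):
    def getSource(self):
        return SQLSource(connection=self.source_connection,
                         query='SELECT name, email FROM customers')

    def getTarget(self):
        return CachedDimension(name='customer', key='customer_id',
                               attributes=['name', 'email'],
                               lookupatts=['email'],
                               targetconnection=self.target_connection_wrap)

    def prepareRow(self, row):
        row['email'] = row['email'].lower()  # example normalization
        return row

    def insertRow(self, target, row):
        target.ensure(row)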