def get_data(self, columns, limit=0, offset=0):
    """
    Extract data from a table
    :param columns: The table's columns to have data extracted
    :param limit: The limit of data extracted
    :param offset: The offset at which to start extracting data
    :return: A list() with the data extracted
    """
    if isinstance(columns, list):
        columns = TableExtractor.prepare_columns(columns)
    query = "SELECT " + columns + " FROM " + self.table_schema + "." + self.table_name
    if offset > 0:
        query += " OFFSET " + str(offset)
    if limit > 0:
        query += " LIMIT " + str(limit)
    data = list(SQLSource(connection=self.db_pgconn, query=query))
    sample_list = list()
    for sample_dict in data:
        sample_list.append(sample_dict)
    return sample_list
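# A minimal standalone sketch of the query that get_data() above assembles; the
# schema, table, and column names here are illustrative assumptions. OFFSET and
# LIMIT are only appended when the corresponding argument is positive.
def build_select_query(columns, table_schema, table_name, limit=0, offset=0):
    query = "SELECT " + ", ".join(columns) + " FROM " + table_schema + "." + table_name
    if offset > 0:
        query += " OFFSET " + str(offset)
    if limit > 0:
        query += " LIMIT " + str(limit)
    return query

# Prints: SELECT id, name FROM public.customers OFFSET 50 LIMIT 100
print(build_select_query(['id', 'name'], 'public', 'customers', limit=100, offset=50))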
def get_special_columns(self, only_fk=False):
    """
    Extracts all primary and foreign key columns of a table
    :param only_fk: If True, only foreign key columns are returned
    :return: A list() with all the primary and foreign key columns
    """
    if self.special_columns:
        return self.special_columns
    query = "SELECT \
                ccu.table_name, \
                ccu.table_schema, \
                kc.column_name, \
                tc.constraint_type \
            FROM information_schema.table_constraints tc \
            JOIN information_schema.key_column_usage kc ON kc.table_name = tc.table_name AND kc.constraint_name = tc.constraint_name \
            JOIN information_schema.constraint_column_usage AS ccu ON ccu.constraint_name = tc.constraint_name \
            WHERE tc.table_name = '" + self.table_name + "' AND tc.table_schema = '" + self.table_schema + "'"
    if only_fk:
        query += " AND tc.constraint_type='FOREIGN KEY'"
    self.special_columns = list(
        SQLSource(connection=self.db_pgconn, query=query))
    return self.special_columns
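# SQLSource yields one dict per row, keyed by the selected column names, so an
# element of the list returned by get_special_columns() has this shape (the
# values below are illustrative assumptions, not data from the original project):
example_special_column = {
    'table_name': 'orders',
    'table_schema': 'public',
    'column_name': 'customer_id',
    'constraint_type': 'FOREIGN KEY',
}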
def get_columns(self, special_columns=True, limit=0):
    """
    Extracts the columns of a table
    :param special_columns: If False, primary and foreign key columns are excluded
    :param limit: The limit of columns to extract
    :return: A list() with the names of the columns that match the search
    """
    query = "SELECT column_name, data_type, character_maximum_length, numeric_precision, is_nullable \
            FROM information_schema.columns \
            WHERE table_schema = '" + self.table_schema + "' AND table_name = '" + self.table_name + "'"
    if not special_columns and len(self.get_special_columns()) > 0:
        query += " AND " + TableExtractor.not_special_columns_where_statement(
            self.get_special_columns())
    where_ignore_columns = TableExtractor.get_ignore_columns(
        self.db_config)
    if where_ignore_columns != '':
        query += " AND " + where_ignore_columns
    if limit > 0:
        query += " LIMIT " + str(limit)
    data = list(SQLSource(connection=self.db_pgconn, query=query))
    return data
def getSource(self):
    self.source_connection = self.getDatabaseConnection(
        self.source_database)
    query = self.getSqlQuery()
    name_mapping = self.getColumnMapping()
    source = SQLSource(connection=self.source_connection,
                       query=query,
                       names=name_mapping)
    return source
def get_data(self):
    query = "SELECT " + self.columns + " FROM " + self.main_table.db_schema.name + '.' + self.main_table.name + self.joins
    data = list(SQLSource(connection=self.db_pgconn, query=query))
    sample_list = list()
    for sample_dict in data:
        sample_list.append(sample_dict)
    return sample_list
def get_tables(self, schemas, search=None):
    """
    Extracts all the tables from a database
    :param schemas: The schemas to search in
    :param search: The keyword to compare with the table names
    :return: A list() of tables that match the search
    """
    schemas_where_statement = DatabaseExtractor.prepare_schemas_where_statement(
        schemas)
    query = "SELECT \
                table_name, table_schema \
            FROM information_schema.tables \
            WHERE " + schemas_where_statement + " {search} \
            ORDER BY table_name;"
    if search is not None:
        query = query.format(search=search)
    else:
        query = query.format(search="")
    return list(SQLSource(connection=self.db_pgconn, query=query))
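# prepare_schemas_where_statement() is not shown in this snippet; a minimal
# sketch of what it presumably builds, judging by how the query above uses it,
# is a WHERE fragment restricting table_schema to the requested schemas:
def prepare_schemas_where_statement(schemas):
    quoted = ", ".join("'" + schema + "'" for schema in schemas)
    return "table_schema IN (" + quoted + ")"

# Prints: table_schema IN ('public', 'sales')
print(prepare_schemas_where_statement(['public', 'sales']))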
# A ConnectionWrapper is created for the data warehouse connection so it can be
# used by all the pygrametl abstractions that need it without being passed around
dw_string = "host='localhost' dbname='etl' user='******' password='******' port=54320"
dw_pgconn = psycopg2.connect(dw_string)

# Although the ConnectionWrapper is shared automatically between pygrametl
# abstractions, we still save it in a variable to allow for it to be closed
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=dw_pgconn)

# As the location dimension stores the name of a location in the attribute
# "city" instead of in the attribute "store" as in the input data from the
# sales relation, a sequence of names matching the number of attributes in
# the relation is created, allowing the SQLSource to do the mapping for us
name_mapping = 'book', 'genre', 'city', 'timestamp', 'sale'

# Extraction of rows from a database using a PEP 249 connection and SQL
sales_source = SQLSource(connection=sales_pgconn,
                         query="SELECT * FROM sales", names=name_mapping)

# Extraction of rows from a CSV file does not require SQL, just an open file
# handle to the file, as pygrametl uses Python's DictReader for CSV files,
# and the header of the CSV file contains information about each column.
region_file_handle = open('c:\\work\\python\\region.csv', 'r', 16384)
region_source = CSVSource(f=region_file_handle, delimiter=',')

# An instance of Dimension is created for each dimension in the data
# warehouse. For each table, the name of the table, the primary key of
# the table, and a list of non key attributes in the table, are added.
# In addition, for the location dimension we specify which attributes
# should be used for a lookup of the primary key, as only the city is
# present in the sales database and is enough to perform a lookup of
# a unique primary key. As mentioned in the beginning of the guide, using
# named parameters is strongly encouraged.
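# A sketch of the dimension and fact table definitions the comments above
# describe, modelled on the pygrametl beginner guide; the table, key, and
# attribute names are assumptions. Note that lookupatts=['city'] lets the
# location dimension be looked up from rows that only carry the city name.
from pygrametl.tables import Dimension, FactTable

book_dimension = Dimension(
    name='book',
    key='bookid',
    attributes=['book', 'genre'])

time_dimension = Dimension(
    name='time',
    key='timeid',
    attributes=['day', 'month', 'year'])

location_dimension = Dimension(
    name='location',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])

fact_table = FactTable(
    name='facttable',
    keyrefs=['bookid', 'locationid', 'timeid'],
    measures=['sale'])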
# Open a connection to the OLTP AntBil
AntBilReplication_conn = pyodbc.connect(CS.AntBilReplication_string)

# Open a connection to the DW AntBil and create a ConnectionWrapper
AntBilDW_conn = pyodbc.connect(CS.AntBilDW_string)
AntBilDW_conn_wrapper = pygrametl.ConnectionWrapper(AntBilDW_conn)
AntBilDW_conn_wrapper.setasdefault()

# Create the data source of each dimension table
attribute_mapping = {'DateKey': int, 'DayOfWeek': int, 'DayOfMonth': int, 'DayOfYear': int, 'WeekOfYear': int,
                     'MonthOfYear': int, 'CalendarQuarter': int, 'CalendarYear': int, 'FiscalMonthOfYear': int,
                     'FiscalQuarter': int, 'FiscalYear': int}
DimDate_source = TypedCSVSource(f=open('DimDate_2017-2037.csv', 'r', 16384),
                                casts=attribute_mapping, delimiter=',')
DimGroup_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimGroup_query)
DimGroupCategory_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimGroupCategory_query)
DimRole_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimRole_query)
DimCandidate_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimCandidate_query)
DimMeetingType_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimMeetingType_query)
DimBadge_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimBadge_query)
DimLocation_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimLocation_query)
DimLevel_source = SQLSource(connection=AntBilReplication_conn, query=SSQ.DimLevel_query)
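# A sketch (not part of the original program) of how one of the sources above
# could be loaded into its dimension table; the DimDate table, key, and
# attribute names are assumptions derived from the keys of attribute_mapping.
# Because the ConnectionWrapper was set as default above, no targetconnection
# has to be passed explicitly.
from pygrametl.tables import Dimension

DimDate_dimension = Dimension(
    name='DimDate',
    key='DateKey',
    attributes=['DayOfWeek', 'DayOfMonth', 'DayOfYear', 'WeekOfYear',
                'MonthOfYear', 'CalendarQuarter', 'CalendarYear',
                'FiscalMonthOfYear', 'FiscalQuarter', 'FiscalYear'])

# insert() keeps the DateKey supplied by the CSV file instead of generating one
for row in DimDate_source:
    DimDate_dimension.insert(row)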
""" A sample pygrametl program """ __author__ = 'Mathias Claus Jensen' import pygrametl from pygrametl.datasources import SQLSource from pygrametl.tables import Dimension, FactTable import sqlite3 input_conn = sqlite3.connect('input.db') output_conn = sqlite3.connect('output.db') input_src = SQLSource(input_conn, query='SELECT * dim1') output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn) dim1 = Dimension(name='dim1', key='key1', attributes=['attr1', 'attr2']) dim1 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4']) ft1 = FactTable(name='ft1', keyrefs=[ 'key1', ]) input_conn.close() output_conn.close()
def run_fact_etl(fact_name, class_name, pygram_fact_factory, source_sql,
                 source_conn, output_conn, create_sql, dimensions={}):
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))

    # create connection to dw
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    # TODO: add try statement to raise error
    # create fact_table_object
    pygram_fact_class = pygram_fact_factory["class"]
    pygram_fact_object = pygram_fact_class(
        name=pygram_fact_factory["name"],
        measures=pygram_fact_factory["measures"],
        keyrefs=pygram_fact_factory["keyrefs"],
        targetconnection=dw_conn_wrapper)

    # create fact table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(fact_name))
    print('create {} if not exist'.format(fact_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for each item of primary key group
    logger.info('create index of {} if not exist'.format(fact_name))
    print('create index of {} if not exist'.format(fact_name))
    for keyref in pygram_fact_factory['keyrefs']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                          ON {}({})'''.format(fact_name, keyref, fact_name, keyref))
    output_conn.commit()

    # Create data_source
    logger.info('start query {}'.format(fact_name))
    print('start query {}'.format(fact_name))
    data_source = SQLSource(connection=source_conn, query=source_sql)

    # handle fact
    final_source = transform_handle(class_name, fact_name, data_source)

    # ensure into fact table
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    if length_source == 0:
        logger.info('no record in query period')
        print('no record in query period')
    else:
        count = 1
        for row in list_data_source:
            row = add_foreign_keys(row, pygram_fact_factory["keyrefs"], dimensions)
            # logger debug pkey and value of row
            dict_keyref = {}
            for keyref in pygram_fact_factory['keyrefs']:
                dict_keyref[keyref] = row[keyref]
            for measure in pygram_fact_factory['measures']:
                dict_keyref[measure] = row[measure]
            logger.debug('row {}:{}'.format(count, dict_keyref))
            # The row can then be inserted into the fact table
            pygram_fact_object.ensure(row)
            progress(count, length_source, status='{}'.format(fact_name))
            count += 1
    print('done')
    output_conn.commit()
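# A hypothetical invocation of run_fact_etl(); the fact name, SQL strings,
# keyrefs, measures, and connections below are illustrative assumptions, not
# values taken from the original project.
from pygrametl.tables import FactTable

fact_sales_factory = {
    'class': FactTable,
    'name': 'fact_sales',
    'keyrefs': ['date_key', 'customer_key'],
    'measures': ['amount'],
}

create_fact_sales_sql = '''CREATE TABLE IF NOT EXISTS fact_sales (
    date_key INTEGER NOT NULL,
    customer_key INTEGER NOT NULL,
    amount NUMERIC)'''

run_fact_etl(fact_name='fact_sales',
             class_name='SalesFact',
             pygram_fact_factory=fact_sales_factory,
             source_sql='SELECT date_key, customer_key, amount FROM staging_sales',
             source_conn=source_conn,
             output_conn=output_conn,
             create_sql=create_fact_sales_sql)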
def run_dimension_etl(dimension_name, class_name, pygram_dimension_factory,
                      source_sql, source_conn, output_conn, create_sql):
    """
    This function can be used in any kind of workflow (for example in a
    celery task) or in a simple main program.
    """
    # TODO: add null user to employee dimension
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))

    # connection wrapper
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    # create dimension table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(dimension_name))
    print('create {} if not exist'.format(dimension_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for dimension
    logger.info('create index of {} if not exist'.format(dimension_name))
    print('create index of {} if not exist'.format(dimension_name))
    for lookupatt in pygram_dimension_factory['lookupatts']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                          ON {}({})'''.format(dimension_name, lookupatt,
                                              dimension_name, lookupatt))
    output_conn.commit()

    # Create dimension
    pygram_dim_class = pygram_dimension_factory["class"]
    pygram_dim_object = pygram_dim_class(
        name=pygram_dimension_factory["name"],
        key=pygram_dimension_factory["key"],
        attributes=pygram_dimension_factory["attributes"],
        lookupatts=pygram_dimension_factory["lookupatts"],
        targetconnection=dw_conn_wrapper,
        cachesize=0,
        prefill=True)

    # TODO: handle datetime dimension here
    # Create data_source
    logger.info('start query {}'.format(dimension_name))
    print('start query {}'.format(dimension_name))
    if dimension_name in [
            'dim_datetime', 'dim_company', 'dim_call_center',
            'dim_dong_ho_o', 'dim_dong_ho_tong', 'dim_hoa_don_tai_chinh'
    ]:
        final_source = source_sql
    else:
        data_source = SQLSource(connection=source_conn, query=source_sql)
        final_source = transform_handle(class_name, dimension_name, data_source)

    # Ensure row into dimension
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    count = 1
    for row in list_data_source:
        pygram_dim_object.scdensure(row)
        progress(count, length_source, status='{}'.format(dimension_name))
        count += 1
    print('done')
    output_conn.commit()
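# progress() is used by both ETL functions above but is not defined in these
# snippets; a minimal sketch of a console progress bar with the same signature
# (an assumption about the original helper) could look like this:
import sys

def progress(count, total, status=''):
    bar_length = 40
    filled = int(round(bar_length * count / float(total)))
    percent = round(100.0 * count / float(total), 1)
    bar = '=' * filled + '-' * (bar_length - filled)
    sys.stdout.write('[{}] {}% ...{}\r'.format(bar, percent, status))
    sys.stdout.flush()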
""" A sample pygrametl program """ __author__ = 'Mathias Claus Jensen' import pygrametl from pygrametl.datasources import SQLSource from pygrametl.tables import Dimension, FactTable import sqlite3 input_conn = sqlite3.connect('input.db') output_conn = sqlite3.connect('output.db') input_src = SQLSource(input_conn, query='SELECT * FROM table') output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn) dim1 = Dimension('dim1', 'key1', ['attr1', 'attr2']) dim2 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4']) ft1 = FactTable(name='ft1', keyrefs=[ 'key1', ]) input_conn.close() output_conn.close()