Example #1
    def get_data(self, columns, limit=0, offset=0):
        """
        Extract data from a table
        :param columns: The table columns to extract data from
        :param limit: The maximum number of rows to extract (0 means no limit)
        :param offset: The row offset at which to start extracting
        :return: A list() with the extracted data
        """
        if isinstance(columns, list):
            columns = TableExtractor.prepare_columns(columns)

        query = "SELECT " + columns + " FROM " + self.table_schema + "." + self.table_name

        if offset > 0:
            query += " OFFSET " + str(offset)

        if limit > 0:
            query += " LIMIT " + str(limit)

        # SQLSource yields one dict per row; materialize the result as a list
        data = list(SQLSource(connection=self.db_pgconn, query=query))

        return data
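A hypothetical call, assuming a TableExtractor instance named extractor and
the column names shown (neither appears in the example itself):

# Hypothetical usage; the instance and column names are assumptions
rows = extractor.get_data(['id', 'name'], limit=100, offset=200)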
Example #2
    def get_special_columns(self, only_fk=False):
        """
        Extracts all primary and foreign key columns of a table
        :param only_fk: If True, only foreign key columns are extracted
        :return: A list() with all the primary and foreign key columns
        """

        if self.special_columns:
            return self.special_columns

        query = "SELECT \
                    ccu.table_name, \
                    ccu.table_schema, \
                    kc.column_name, \
                    tc.constraint_type \
                FROM information_schema.table_constraints tc \
                JOIN information_schema.key_column_usage kc ON kc.table_name = tc.table_name AND kc.constraint_name = tc.constraint_name \
                JOIN information_schema.constraint_column_usage AS ccu ON ccu.constraint_name = tc.constraint_name \
                WHERE tc.table_name = '" + self.table_name + "' AND tc.table_schema = '" + self.table_schema + "'"

        if only_fk:
            query += " AND tc.constraint_type='FOREIGN KEY'"

        self.special_columns = list(
            SQLSource(connection=self.db_pgconn, query=query))

        return self.special_columns
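A hypothetical call (the extractor instance is an assumption); with
only_fk=True the query is restricted to FOREIGN KEY constraints:

# Hypothetical usage; "extractor" is an assumed TableExtractor instance
fk_columns = extractor.get_special_columns(only_fk=True)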
Example #3
    def get_columns(self, special_columns=True, limit=0):
        """
        Extracts the columns of a table
        :param special_columns: If False, primary and foreign key columns are excluded
        :param limit: The maximum number of columns to extract (0 means no limit)
        :return: A list() with the columns that match the search
        """
        query = "SELECT  column_name, data_type, character_maximum_length, numeric_precision, is_nullable \
                FROM information_schema.columns \
                WHERE table_schema = '" + self.table_schema + "' AND table_name = '" + self.table_name + "'"

        if not special_columns and len(self.get_special_columns()) > 0:
            query += " AND " + TableExtractor.not_special_columns_where_statement(
                self.get_special_columns())

        where_ignore_columns = TableExtractor.get_ignore_columns(
            self.db_config)

        if where_ignore_columns != '':
            query += " AND " + where_ignore_columns

        if limit > 0:
            query += " LIMIT " + str(limit)

        data = list(SQLSource(connection=self.db_pgconn, query=query))

        return data
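A hypothetical call (the extractor instance is an assumption); passing
special_columns=False filters out the primary and foreign key columns:

# Hypothetical usage; at most 10 non-key, non-ignored columns
plain_columns = extractor.get_columns(special_columns=False, limit=10)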
Example #4
    def getSource(self):
        self.source_connection = self.getDatabaseConnection(
            self.source_database)
        query = self.getSqlQuery()
        name_mapping = self.getColumnMapping()
        source = SQLSource(connection=self.source_connection,
                           query=query,
                           names=name_mapping)
        return source
Example #5
    def get_data(self):
        query = "SELECT " + self.columns + " FROM " + self.main_table.db_schema.name + '.' + self.main_table.name + self.joins

        # SQLSource yields one dict per row; materialize the result as a list
        data = list(SQLSource(connection=self.db_pgconn, query=query))

        return data
Example #6
    def get_tables(self, schemas, search=None):
        """
        Extracts all the tables from a database
        :param schemas: The schemas to search in
        :param search: An optional SQL fragment used to filter the table names
        :return: A list() of tables that match the search
        """
        schemas_where_statement = DatabaseExtractor.prepare_schemas_where_statement(
            schemas)

        query = "SELECT \
                 table_name, table_schema \
                FROM information_schema.tables \
                WHERE " + schemas_where_statement + " {search} \
                ORDER BY table_name;"

        if search is not None:
            query = query.format(search=search)
        else:
            query = query.format(search="")

        return list(SQLSource(connection=self.db_pgconn, query=query))
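Note that search is spliced into the WHERE clause verbatim, so it must be an
SQL fragment rather than a bare keyword. A hypothetical call (the
DatabaseExtractor instance and the fragment are assumptions):

# Hypothetical usage; the fragment must be valid SQL, e.g. an AND clause
tables = extractor.get_tables(['public'],
                              search="AND table_name LIKE 'dim_%'")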
Example #7
# A PEP 249 connection to the data warehouse is created and then wrapped so it
# can be shared by all the pygrametl abstractions that need it without being
# passed around
dw_string = "host='localhost' dbname='etl' user='******' password='******' port=54320"
dw_pgconn = psycopg2.connect(dw_string)

# Although the ConnectionWrapper is shared automatically between pygrametl
# abstractions, we still save it in a variable so that it can be closed
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=dw_pgconn)

# As the location dimension stores the name of a location in the attribute
# "city" instead of in the attribute "store" as in the input data from the
# sales relation, a sequence of names matching the number of attributes in
# the relation is created, allowing the SQLSource to do the mapping for us
name_mapping = 'book', 'genre', 'city', 'timestamp', 'sale'

# Extraction of rows from a database using a PEP 249 connection and SQL
sales_source = SQLSource(connection=sales_pgconn,
                         query="SELECT * FROM sales", names=name_mapping)

# Extraction of rows from a CSV file does not require SQL, just an open file
# handle to the file, as pygrametl uses Python's csv.DictReader for CSV files,
# and the header of the CSV file contains information about each column.
region_file_handle = open('c:\\work\\python\\region.csv', 'r', 16384)
region_source = CSVSource(f=region_file_handle, delimiter=',')

# An instance of Dimension is created for each dimension in the data
# warehouse. For each table, the name of the table, the primary key of
# the table, and a list of non key attributes in the table, are added.
# In addition, for the location dimension we specify which attributes
# should be used for a lookup of the primary key, as only the city is
# present in the sales database and is enough to perform a lookup of
# a unique primary key. As mentioned in the beginning of the guide, using
# named parameters is strongly encouraged.
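The Dimension definitions the comment describes are cut off in this example;
a minimal sketch of what they likely look like follows (the exact table, key,
and attribute names are assumptions based on the comment):

# Sketch of the described Dimension instances; names are assumptions
book_dimension = Dimension(
    name='book',
    key='bookid',
    attributes=['book', 'genre'])

location_dimension = Dimension(
    name='location',
    key='locationid',
    attributes=['city', 'region'],
    lookupatts=['city'])  # only "city" is needed to look up the key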
Example #8
# Open a connection to the OLTP AntBil
AntBilReplication_conn = pyodbc.connect(CS.AntBilReplication_string)

# Open a connection to the DW AntBil and create a ConnectionWrapper
AntBilDW_conn = pyodbc.connect(CS.AntBilDW_string)
AntBilDW_conn_wrapper = pygrametl.ConnectionWrapper(AntBilDW_conn)
AntBilDW_conn_wrapper.setasdefault()

# Create the data source of each dimension table
attribute_mapping = {'DateKey': int, 'DayOfWeek': int, 'DayOfMonth': int,
                     'DayOfYear': int, 'WeekOfYear': int, 'MonthOfYear': int,
                     'CalendarQuarter': int, 'CalendarYear': int,
                     'FiscalMonthOfYear': int, 'FiscalQuarter': int,
                     'FiscalYear': int}
DimDate_source = TypedCSVSource(f=open('DimDate_2017-2037.csv', 'r', 16384),
                                casts=attribute_mapping,
                                delimiter=',')
DimGroup_source = SQLSource(connection=AntBilReplication_conn,
                            query=SSQ.DimGroup_query)
DimGroupCategory_source = SQLSource(connection=AntBilReplication_conn,
                                    query=SSQ.DimGroupCategory_query)
DimRole_source = SQLSource(connection=AntBilReplication_conn,
                           query=SSQ.DimRole_query)
DimCandidate_source = SQLSource(connection=AntBilReplication_conn,
                                query=SSQ.DimCandidate_query)
DimMeetingType_source = SQLSource(connection=AntBilReplication_conn,
                                  query=SSQ.DimMeetingType_query)
DimBadge_source = SQLSource(connection=AntBilReplication_conn,
                            query=SSQ.DimBadge_query)
DimLocation_source = SQLSource(connection=AntBilReplication_conn,
                               query=SSQ.DimLocation_query)
DimLevel_source = SQLSource(connection=AntBilReplication_conn,
                            query=SSQ.DimLevel_query)
Example #9
""" A sample pygrametl program
"""

__author__ = 'Mathias Claus Jensen'

import pygrametl
from pygrametl.datasources import SQLSource
from pygrametl.tables import Dimension, FactTable
import sqlite3

input_conn = sqlite3.connect('input.db')
output_conn = sqlite3.connect('output.db')

input_src = SQLSource(input_conn, query='SELECT * FROM dim1')
output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

dim1 = Dimension(name='dim1', key='key1', attributes=['attr1', 'attr2'])

dim2 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4'])

ft1 = FactTable(name='ft1', keyrefs=[
    'key1',
])
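
# Note: the dimensions and fact table above are created but never loaded.
# A minimal (hypothetical) loading step could run here, before close():
#
#     for row in input_src:
#         dim1.insert(row)
#     output_wrapper.commit()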

input_conn.close()
output_conn.close()
Example #10
def run_fact_etl(fact_name,
                 class_name,
                 pygram_fact_factory,
                 source_sql,
                 source_conn,
                 output_conn,
                 create_sql,
                 dimensions=None):
    # Avoid a shared mutable default argument
    if dimensions is None:
        dimensions = {}
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))

    # create connection to dw
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)
    # TODO: add try statement to raise error

    # create fact_table_object
    pygram_fact_class = pygram_fact_factory["class"]
    pygram_fact_object = pygram_fact_class(
        name=pygram_fact_factory["name"],
        measures=pygram_fact_factory["measures"],
        keyrefs=pygram_fact_factory["keyrefs"],
        targetconnection=dw_conn_wrapper)

    # create fact table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(fact_name))
    print('create {} if not exist'.format(fact_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for each item of primary key group
    logger.info('create index of {} if not exist'.format(fact_name))
    print('create index of {} if not exist'.format(fact_name))
    for keyref in pygram_fact_factory['keyrefs']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                  ON {}({})'''.format(fact_name, keyref, fact_name, keyref))
    output_conn.commit()

    # Create data_source
    logger.info('start query {}'.format(fact_name))
    print('start query {}'.format(fact_name))
    data_source = SQLSource(connection=source_conn, query=source_sql)

    # handle fact
    final_source = transform_handle(class_name, fact_name, data_source)

    # ensure into fact table
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    if length_source == 0:
        logger.info('no record in query period')
        print('no record in query period')
    else:
        count = 1
        for row in list_data_source:
            row = add_foreign_keys(row, pygram_fact_factory["keyrefs"],
                                   dimensions)
            # logger debug pkey and value of row
            dict_keyref = {}
            for keyref in pygram_fact_factory['keyrefs']:
                dict_keyref[keyref] = row[keyref]
            for measure in pygram_fact_factory['measures']:
                dict_keyref[measure] = row[measure]
            logger.debug('row {}:{}'.format(count, dict_keyref))
            # The row can then be inserted into the fact table
            pygram_fact_object.ensure(row)
            progress(count, length_source, status='{}'.format(fact_name))
            count += 1
    print('done')
    output_conn.commit()
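A hypothetical invocation; the table, key, and measure names, the SQL, and the
sqlite3 connections are all assumptions, and the module-level helpers used
above (transform_handle, add_foreign_keys, logger, progress) must be
available:

import sqlite3
import pygrametl.tables

source_conn = sqlite3.connect('source.db')
output_conn = sqlite3.connect('dw.db')

fact_factory = {
    'class': pygrametl.tables.FactTable,
    'name': 'fact_sales',
    'measures': ['amount'],
    'keyrefs': ['date_key', 'store_key'],
}
run_fact_etl(fact_name='fact_sales',
             class_name='FactSales',
             pygram_fact_factory=fact_factory,
             source_sql='SELECT * FROM sales',
             source_conn=source_conn,
             output_conn=output_conn,
             create_sql='CREATE TABLE IF NOT EXISTS fact_sales '
                        '(date_key INT, store_key INT, amount REAL)',
             dimensions={})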
Example #11
def run_dimension_etl(dimension_name, class_name, pygram_dimension_factory,
                      source_sql, source_conn, output_conn, create_sql):
    """
    This function can be used in any kind of workflow (for example in a celery
    task) or in a simple main program.
    """
    # TODO: add null user to employee dimension
    # print current time
    print('current time is {}'.format(datetime.datetime.now()))
    # connection wrapper
    dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

    # create dimension table by create_sql
    cursor = output_conn.cursor()
    logger.info('create {} if not exist'.format(dimension_name))
    print('create {} if not exist'.format(dimension_name))
    cursor.execute(create_sql)
    output_conn.commit()

    # create index for dimension
    logger.info('create index of {} if not exist'.format(dimension_name))
    print('create index of {} if not exist'.format(dimension_name))
    for lookupatt in pygram_dimension_factory['lookupatts']:
        cursor.execute('''CREATE INDEX IF NOT EXISTS {}_{}_idx
                      ON {}({})'''.format(dimension_name, lookupatt,
                                          dimension_name, lookupatt))
    output_conn.commit()

    # Create dimension
    pygram_dim_class = pygram_dimension_factory["class"]
    pygram_dim_object = pygram_dim_class(
        name=pygram_dimension_factory["name"],
        key=pygram_dimension_factory["key"],
        attributes=pygram_dimension_factory["attributes"],
        lookupatts=pygram_dimension_factory["lookupatts"],
        targetconnection=dw_conn_wrapper,
        cachesize=0,
        prefill=True)

    # TODO: handle datetime dimension here

    # Create data_source
    logger.info('start query {}'.format(dimension_name))
    print('start query {}'.format(dimension_name))
    # For these dimensions, source_sql apparently already holds an iterable
    # of rows rather than an SQL string, so it is used as the source directly
    if dimension_name in [
            'dim_datetime', 'dim_company', 'dim_call_center', 'dim_dong_ho_o',
            'dim_dong_ho_tong', 'dim_hoa_don_tai_chinh'
    ]:
        final_source = source_sql

    else:
        data_source = SQLSource(connection=source_conn, query=source_sql)
        final_source = transform_handle(class_name, dimension_name,
                                        data_source)

    # Ensure row into dimension
    list_data_source = list(final_source)
    length_source = len(list_data_source)
    count = 1
    for row in list_data_source:
        pygram_dim_object.scdensure(row)
        progress(count, length_source, status='{}'.format(dimension_name))
        count += 1
    print('done')

    output_conn.commit()
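A hypothetical factory dict showing the keys run_dimension_etl reads; the
concrete class and all names are assumptions:

from pygrametl.tables import SlowlyChangingDimension

# Hypothetical factory; run_dimension_etl reads exactly these keys.
# scdensure() above implies a slowly changing dimension class; note that
# pygrametl's SlowlyChangingDimension also expects an ordering or version
# attribute, which run_dimension_etl does not pass, so the real project
# likely uses a subclass that supplies it
dim_factory = {
    'class': SlowlyChangingDimension,
    'name': 'dim_store',
    'key': 'store_key',
    'attributes': ['store_name', 'city'],
    'lookupatts': ['store_name'],
}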
Example #12
""" A sample pygrametl program
"""

__author__ = 'Mathias Claus Jensen'

import pygrametl
from pygrametl.datasources import SQLSource
from pygrametl.tables import Dimension, FactTable
import sqlite3

input_conn = sqlite3.connect('input.db')
output_conn = sqlite3.connect('output.db')

input_src = SQLSource(input_conn, query='SELECT * FROM "table"')  # "table" is an SQL keyword and must be quoted
output_wrapper = pygrametl.ConnectionWrapper(connection=output_conn)

dim1 = Dimension('dim1', 'key1', ['attr1', 'attr2'])

dim2 = Dimension(name='dim2', key='key2', attributes=['attr3', 'attr4'])

ft1 = FactTable(name='ft1', keyrefs=[
    'key1',
])

input_conn.close()
output_conn.close()