def copy_s3_to_staging(self, table_name):
        '''
        Takes connection parameters and a table name, then copies that table's
        changes file in S3 to its staging table in redshift.
        '''

        redshift_params = self.con_params
        ##############################################
        # I should point out that the database name and
        # IAM role are hardcoded here. That's garbage.
        ##############################################

        with open_redshift_db_connection(redshift_params) as c:
            try:
                command = """COPY """ + table_name + """_staging
                    FROM 's3://ichain-sync-machine/""" + 'DEAHINPHSDB' + '/' + table_name + '/' + table_name + """-changes.csv'
                    IAM_ROLE 'arn:aws:iam::265991248033:role/myRedShiftforKin'
                    delimiter ','
                    IGNOREHEADER 1 removequotes emptyasnull blanksasnull maxerror 1
                    COMPUPDATE OFF STATUPDATE OFF;
                    """
                #print command
                c.execute(command)
            except Exception as e:
                print(e)
                return False
        return True
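    # A minimal sketch of the fix the banner above asks for: lift the hardcoded
    # database name and IAM role into arguments. The parameter names
    # (s3_bucket, src_db_name, iam_role) and the method name are illustrative
    # assumptions, not part of the original class.
    def copy_s3_to_staging_configurable(self, table_name, s3_bucket,
                                        src_db_name, iam_role):
        command = ("""COPY """ + table_name + """_staging
            FROM 's3://""" + s3_bucket + '/' + src_db_name + '/' + table_name + '/' + table_name + """-changes.csv'
            IAM_ROLE '""" + iam_role + """'
            delimiter ','
            IGNOREHEADER 1 removequotes emptyasnull blanksasnull maxerror 1
            COMPUPDATE OFF STATUPDATE OFF;
            """)
        with open_redshift_db_connection(self.con_params) as c:
            try:
                c.execute(command)
            except Exception as e:
                print(e)
                return False
        return True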
    def init_staging_tables(self):
        '''
        This is overriding all column datatypes...
        TODO: I should really be querying the source db and finding out
        the data types for each column instead of just setting them all to
        varchar(255) the way I am now.

        TODO: I should also be looking at the staging table to make sure that
        there are no rows left to be delserted into production.
        '''
        for table in self.src_db.tables_with_pks:
            columns_info_list = self.src_db.get_column_names(table)
            #Forgive me for I have hardcoded...
            cols_and_types = '__$start_lsn varchar(255), __$operation varchar(255), __$update_mask varchar(255)'
            #This is dumb, I'm overriding datatypes for all columns...
            for row in columns_info_list:
                cols_and_types += ',\n' + row[3] + ' ' + 'varchar(255)'
            pk = self.src_db.get_table_primary_key(table)

            if pk:
                command = ("""CREATE TABLE """ + table + """_staging(
                """ + cols_and_types + """,
                PRIMARY KEY (""" + str(pk) + """)\n);""")
            else:
                # Source table has no primary key; skip the PK clause so the
                # DDL stays valid instead of emitting PRIMARY KEY ().
                command = ("""CREATE TABLE """ + table + """_staging(
                """ + cols_and_types + """\n);""")
            print command
            with open_redshift_db_connection(self.con_params) as c:
                c.execute(command)
        return True
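    # Sketch of the first TODO above: ask the source database for each column's
    # real type instead of forcing varchar(255). Assumes the source exposes a
    # standard information_schema and that self.src_db can run raw queries via
    # a run_query helper; both the helper and this method name are hypothetical,
    # not part of the original class.
    def get_source_column_types(self, table):
        query = ("SELECT column_name, data_type "
                 "FROM information_schema.columns "
                 "WHERE table_name = '" + table + "';")
        # Expected shape: [('id', 'int'), ('name', 'nvarchar'), ...]
        return self.src_db.run_query(query)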
    def redshift_delsert(self, table, pk, column_name_list):
        '''
        TODO: The design has changed. Instead of retrieving the column name
        list and primary key from the config folder in S3, I'm now querying
        that info directly from the source database. That means I now need to
        make space in the redshift instantiation for this.
        '''

        con_params = self.con_params
        with open_redshift_db_connection(con_params) as c:
            #Delete rows that were deleted or updated
            command = '''DELETE FROM ''' + table + '''
            USING ''' + table + '''_staging a
            WHERE ''' + table + '''.''' + pk + '''=a.''' + pk + ''' AND (__$operation LIKE '%4%' OR __$operation LIKE '%1%');'''

            c.execute(command)
            # Insert rows into redshift prod that were inserted or updated in src
            res = ','.join(col.lower() for col in column_name_list)

            command = ('''INSERT INTO ''' + table.upper() + ''' (''' + res + ''')
            SELECT ''' + res + ''' FROM ''' + table.upper() + '''_staging s
            WHERE __$operation LIKE '%4%' OR __$operation LIKE '%2%';''')
            #print command
            c.execute(command)
        return True
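    # Illustrative end-to-end sync pass tying the methods above together. The
    # methods called here all exist in this class, but the loop itself is a
    # sketch, not original code.
    def sync_all_tables(self):
        for table in self.src_db.tables_with_pks:
            pk = self.src_db.get_table_primary_key(table)
            cols = [row[3] for row in self.src_db.get_column_names(table)]
            self.truncate_redshift_staging_table(table)   # empty the staging table
            self.copy_s3_to_staging(table)                # load the CDC changes file
            self.redshift_delsert(table, pk, cols)        # apply deletes + inserts
        return True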
def test2(redshift_params):
    res = 'yis'
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("""SELECT * FROM DEAA0_staging;""")
        rows = c.fetchall()
        if rows:
            res = rows
        print res
        return res
def create_staging_table(redshift_params):
    '''
    Creates a throwaway table for testing DDL against redshift. The one-column
    schema is just a placeholder.
    '''
    with open_redshift_db_connection(redshift_params) as c:
        # CREATE TABLE needs at least one column definition; this one is a
        # placeholder. There is no result set to fetch after DDL.
        c.execute("CREATE TABLE sometable (somecol varchar(255));")
    return
    def redshift_test(self, rs_params):
        '''
        This is a simple test of the redshift db. Prints 10 rows from a toytable in
        the redshift dev database.
        '''
        with open_redshift_db_connection(rs_params) as c:
            c.execute("SELECT * FROM toytable LIMIT 10;")
            rows = c.fetchall()
            for i in rows:
                print i
        return True
    def truncate_redshift_staging_table(self, table_name):
        '''
        This deletes the rows in a STAGING table without dropping the table or
        its column definitions.
        '''

        con_params = self.con_params
        command = ("""Truncate """ + table_name + """_staging;""")
        with open_redshift_db_connection(con_params) as c:
            try:
                c.execute(command)
            except Exception as e:
                print(e)
                return False
        return True
def insert_rs_rows(con_params, table_name):
    '''
    This will insert rows from the staging table in redshift to the proper table.
    The unfinished SQL template that used to live here is reconstructed below;
    it assumes the production table shares the staging table's column layout
    and reuses the __$operation filter from the template.
    '''
    command = ('''INSERT INTO ''' + table_name + '''
    SELECT * FROM ''' + table_name + '''_staging
    WHERE __$operation = '3' OR __$operation = '4';''')
    with open_redshift_db_connection(con_params) as c:
        try:
            c.execute(command)
        except Exception as e:
            print(e)
            return False
    return True
def test(redshift_params):
    '''
    This is a simple test of the redshift db. Prints 10 rows from a toytable in 
    the redshift dev database. 
    '''
    df = None
    cols = None
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("SELECT * FROM toytable LIMIT 10;")
        rows = c.fetchall()
        print rows
        '''
        df = DataFrame(rows)
        cols = [col[0] for col in c.description]
        df = df.to_csv(sep=',')
        '''
    return df, cols
def test3(redshift_params):
    res = 'yis'
    print res
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("""SELECT
                DISTINCT tablename
                FROM PG_TABLE_DEF
                WHERE schemaname = 'public';""")
        rows = c.fetchall()
        if rows:
            res = rows
            for i in res:
                print i
        print "okay"
        return res
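# Sketch building on the PG_TABLE_DEF query above: verify that every expected
# staging table actually exists in Redshift. The function name and the
# expected_tables argument are hypothetical, not part of the original code.
def staging_tables_exist(redshift_params, expected_tables):
    with open_redshift_db_connection(redshift_params) as c:
        c.execute("""SELECT DISTINCT tablename
                FROM PG_TABLE_DEF
                WHERE schemaname = 'public';""")
        existing = set(row[0] for row in c.fetchall())
    return all(t.lower() + '_staging' in existing for t in expected_tables)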
def del_rs_rows(con_params, table_name, pk):
    '''
    This will delete rows from the proper table in redshift that were updated
    or deleted in the source, using the staging table as the reference. The pk
    argument names the join column; it is an added parameter, since the
    unfinished SQL template that used to live here never specified the join
    condition. The reconstruction mirrors the delete half of redshift_delsert.
    '''
    command = ('''DELETE FROM ''' + table_name + '''
    USING ''' + table_name + '''_staging s
    WHERE ''' + table_name + '''.''' + pk + ''' = s.''' + pk + '''
    AND (__$operation = '3' OR __$operation = '4');''')
    with open_redshift_db_connection(con_params) as c:
        try:
            c.execute(command)
        except Exception as e:
            print(e)
    return
    def init_rs_dummy_tables(self):
        '''
        This is just making an empty table in redshift to test the delsert command
        since I haven't populated the redshift db with data yet.
        '''
        for table in self.src_db.tables_with_pks:
            columns_info_list = self.src_db.get_column_names(table)
            cols_and_types = ',\n'.join(row[3] + ' varchar(255)'
                                        for row in columns_info_list)
            pk = self.src_db.get_table_primary_key(table)
            if pk:
                command = ("""CREATE TABLE """ + table.lower() + """(
                """ + cols_and_types + """,
                PRIMARY KEY (""" + str(pk) + """)\n);""")
            else:
                # No primary key; omit the PK clause so the DDL stays valid.
                command = ("""CREATE TABLE """ + table.lower() + """(
                """ + cols_and_types + """\n);""")
            print command
            with open_redshift_db_connection(self.con_params) as c:
                c.execute(command)
        return True
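    # Illustrative helper to seed one fake CDC row into a staging table so the
    # delsert path can be exercised against the dummy tables above. The method
    # name and the column values are made up for testing only.
    def seed_dummy_staging_row(self, table, pk_col):
        command = ("INSERT INTO " + table + "_staging "
                   "(__$start_lsn, __$operation, __$update_mask, " + pk_col + ") "
                   "VALUES ('0x00', '2', '0x00', '1');")
        with open_redshift_db_connection(self.con_params) as c:
            c.execute(command)
        return True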
    def vacuum_redshift_db(self, table=None):
        '''
        From AWS documentation:
        'Reclaims space and resorts rows in either a specified 
        table or all tables in the current database.'
        SEE: https://docs.aws.amazon.com/redshift/latest/dg/r_VACUUM_command.html
        
        Default is to vacuum ALL tables. If you supply this method with a 
        table name, only that table will be vacuumed
        '''
        con_params = self.con_params
        if table:
            command = 'vacuum ' + table + ';'
        else:
            command = 'vacuum;'
        with open_redshift_db_connection(con_params) as c:
            try:
                c.execute(command)
            except Exception as e:
                print(e)
                return False

        return True
    def analyze_redshift_db(self, table=None):
        '''
        From AWS documentation:
        'Updates table statistics for use by the query planner.'
        SEE: https://docs.aws.amazon.com/redshift/latest/dg/r_ANALYZE.html
        
        Default is to analyze ALL tables. If you supply this method with a 
        table name, only that table will be analyzed.
        '''

        con_params = self.con_params

        if table:
            command = 'analyze ' + table + ';'
        else:
            command = 'analyze verbose;'

        with open_redshift_db_connection(con_params) as c:
            try:
                c.execute(command)
            except Exception as e:
                print(e)
                return False
        return True
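    # Usage sketch: after a sync run, vacuum and then analyze each synced table
    # so Redshift reclaims space and refreshes planner statistics. The loop is
    # illustrative, not original code; it only calls the two methods above.
    def maintain_redshift_db(self):
        for table in self.src_db.tables_with_pks:
            self.vacuum_redshift_db(table)
            self.analyze_redshift_db(table)
        return True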