def clean_cdc_table(self, table, threshold=100000):
    '''
    Delete the changes logged in the CDC table of a given table.

    Runs sys.sp_cdc_cleanup_change_table against the 'dbo_<table>' capture
    instance, using self.end_time_lsn as the low water mark.

    !!! IMPORTANT !!!
    If you're debugging this method, consider the '@threshold' value; it is
    exposed as the `threshold` argument.

    table:     source table name (without the 'dbo_' prefix).
    threshold: value sent as @threshold to the stored procedure. Defaults
               to 100000, the value that used to be hard-coded.

    Returns True when the cleanup statement executed, False when it raised.
    A failure is reported and swallowed, so the call stays best-effort.
    '''
    # NOTE: the statement is built by string concatenation, so `table` and
    # self.end_time_lsn must come from trusted sources. int() keeps the
    # threshold from smuggling anything but a number into the SQL text.
    command = (
        '''DECLARE @Last_LSN VARBINARY(10) =''' + self.end_time_lsn +
        ''' EXEC sys.sp_cdc_cleanup_change_table @capture_instance = 'dbo_''' + table +
        '''', @low_water_mark = @Last_LSN, @threshold = ''' + str(int(threshold))
    )
    with open_sql_db_connection(self.con_params) as cursor:
        try:
            cursor.execute(command)
        except Exception as err:
            # Was a bare `except: pass`, which also hid KeyboardInterrupt
            # and gave no hint that the cleanup never happened.
            print("CDC cleanup failed for table %s: %s" % (table, err))
            return False
    return True
def startup_procedures(self):
    '''
    Start up procedures for the ms sql db.

    Checks that CDC is enabled on all user tables
    (self.check_CDC_on_user_tables) and, if it is not, tries to enable it
    (self.enable_CDC_on_tables).

    !!!!!!!IMPORTANT!!!!!!!!
    CDC must be enabled on the ENTIRE database before it can be enabled on
    each table. This method only performs the table-level step; it does not
    call the database-level enable. Those commands may need to be run as
    admin, so it can be necessary to execute them manually.

    Returns True when CDC is (or has just been) enabled on all tables and
    False when enabling failed.
    '''
    print("RUNNING MSSQL STARTUP PROCEDURES")
    # No connection is opened here: both helpers are called without a
    # cursor argument, so the cursor this method used to open was never used.
    if self.check_CDC_on_user_tables():
        print("CDC IS PROPERLY ENABLED")
        return True

    print("CDC WAS NOT ENABLED ON ALL TABLES, ATTEMPTING TO ENABLE")
    if self.enable_CDC_on_tables():
        print("CDC ENABLE WAS SUCCESSFUL")
        return True

    # TODO: raise an Airflow exception here instead of only reporting.
    print("CDC ENABLE FAILED")
    return False
def get_min_lsn(self, table):
    '''
    Look up the minimum LSN of the CDC capture instance 'dbo_<table>'.

    The value returned by sys.fn_cdc_get_min_lsn is stringified and handed
    to self.lsn_bytearray_converter; that converter's result is returned.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        # NOTE: `table` is spliced into the SQL text; callers must pass a
        # trusted table name.
        query = "".join([
            """SELECT sys.fn_cdc_get_min_lsn('dbo_""",
            table,
            """');""",
        ])
        cursor.execute(query)
        first_row = cursor.fetchone()
        raw_lsn = str(first_row[0])
        return self.lsn_bytearray_converter(raw_lsn)
def insert(self):
    '''
    Leftover scratch method (the original author called it nonsense).

    Sends one hard-coded statement against DEAA0 and fetches a single row,
    which is discarded. Nothing is returned.

    NOTE(review): 'INSERT * FROM ...' does not look like valid T-SQL, so
    the execute call is expected to fail; confirm and consider deleting
    this method.
    '''
    statement = """INSERT * FROM DEAA0 WHERE City LIKE 'Las A%' ORDER BY DEANumber"""
    with open_sql_db_connection(self.con_params) as cursor:
        cursor.execute(statement)
        cursor.fetchone()
def get_table_primary_key(self, tablename):
    '''
    Return a primary key column name of `tablename`, or None.

    Runs sp_pkeys and reads only the FIRST row of its result, taking the
    value at index 3 (the COLUMN_NAME column in sp_pkeys' documented
    result set). For a composite key only one of its columns is therefore
    returned. None is returned when sp_pkeys yields no row.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        # NOTE: `tablename` is spliced into the SQL text; pass trusted
        # names only.
        cursor.execute("""exec sp_pkeys '""" + tablename + """';""")
        first_key_row = cursor.fetchone()
        return first_key_row[3] if first_key_row else None
def get_sql_end_lsn2(self):
    '''
    DEPRECATED! Kept around just in case.

    Asks the server for its current time (GETDATE()) and returns that
    value, i.e. the first column of the single row produced. It was meant
    as the end marker for cleaning up the CDC table after the net changes
    are uploaded to s3.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        cursor.execute(
            """DECLARE @end_time datetime; SET @end_time = GETDATE(); SELECT @end_time;"""
        )
        server_now_row = cursor.fetchone()
    return server_now_row[0]
def get_table_changes(self, tableName):
    '''
    Return all the rows of the CDC change table of `tableName`.

    Reads every row of cdc.dbo_<tableName>_CT. When the query fails (for
    example because the change table does not exist) the error is reported
    and None is returned instead of raising.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        try:
            # NOTE: `tableName` is spliced into the SQL text; pass trusted
            # names only.
            cursor.execute("SELECT * FROM cdc.dbo_" + tableName + "_CT;")
            return cursor.fetchall()
        except Exception as err:
            # Was a bare `except:`, which also trapped KeyboardInterrupt /
            # SystemExit and threw the real error away.
            # TODO: this should be changed to a log statement.
            print("table didn't exist...maybe (%s): %s" % (tableName, err))
            return None
def get_tables_tracked_by_CDC(self):
    '''
    Return a list with the names of all tables in the db that are
    currently tracked by CDC (sys.tables rows with is_tracked_by_cdc = 1).
    '''
    # Select the name column explicitly instead of `SELECT *` + row[0], so
    # the result does not depend on the column order of sys.tables.
    command = "SELECT name FROM sys.tables WHERE is_tracked_by_cdc = 1;"
    with open_sql_db_connection(self.con_params) as cursor:
        cursor.execute(command)
        return [row[0] for row in cursor.fetchall()]
def enable_CDC_on_db(self):
    '''
    Enable CDC on the entire db named in self.con_params['database'].

    NOTES:
    -This uses msdb.dbo.rds_cdc_enable_db, i.e. the variant for a SQL
     instance hosted on AWS rds; the operation is handled differently
     elsewhere, so consider this when debugging.
    -TODO: THIS NEEDS TO RETURN AN ERROR. For now it always returns True;
     a failure only surfaces as an exception raised by execute().
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        # The opening quote before the database name was missing, which
        # produced an unterminated string literal in the SQL text.
        command = (
            """exec msdb.dbo.rds_cdc_enable_db '"""
            + self.con_params['database']
            + """';"""
        )
        cursor.execute(command)
    return True
def get_sql_end_lsn(self):
    '''
    Return the end LSN as a hex literal string ('0x...').

    The server maps its current time (GETDATE()) to an LSN with
    sys.fn_cdc_map_time_to_lsn('largest less than or equal', ...). The
    binary value is hex-encoded and prefixed with '0x'. This is used to
    clean up the CDC table after the net changes are uploaded to s3.

    NOTE(review): str(...).encode('hex') is Python 2 only, and a NULL LSN
    would be hex-encoded as the text 'None' -- confirm whether the NULL
    case can occur.
    '''
    lsn_query = """DECLARE @end_time datetime, @to_lsn binary(10); SET @end_time = GETDATE(); SET @to_lsn = sys.fn_cdc_map_time_to_lsn('largest less than or equal', @end_time) SELECT @to_lsn;"""
    with open_sql_db_connection(self.con_params) as cursor:
        cursor.execute(lsn_query)
        lsn_row = cursor.fetchone()
        hex_digits = str(lsn_row[0]).encode('hex')
    return '0x' + hex_digits
def get_table_names(self):
    '''
    Return a list with the names of all the tables in the db which were
    NOT shipped with ms sql (sys.tables rows with is_ms_shipped = 0). In
    other words, the tables created by a user.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        # Select the name column explicitly instead of `SELECT *` + row[0],
        # so the result does not depend on the column order of sys.tables.
        command = "SELECT name FROM sys.tables WHERE is_ms_shipped = 0;"
        cursor.execute(command)
        return [row[0] for row in cursor.fetchall()]
def table_has_changes(self, table_name):
    '''
    Tell whether `table_name` has net changes that were not unloaded yet.

    Counts the rows of cdc.fn_cdc_get_net_changes_dbo_<table_name> between
    the table's minimum LSN (self.get_min_lsn) and self.end_time_lsn.

    Returns the COUNT(*) value itself rather than a strict boolean, so the
    result is truthy exactly when at least one change exists.
    '''
    lower_lsn = self.get_min_lsn(table_name)
    upper_lsn = self.end_time_lsn
    with open_sql_db_connection(self.con_params) as cursor:
        # NOTE: the LSNs and the table name are spliced into the SQL text.
        count_sql = "".join([
            """ DECLARE @from_lsn binary(10), @to_lsn binary(10); SET @from_lsn =""",
            str(lower_lsn),
            """; SET @to_lsn =""",
            str(upper_lsn),
            """; SELECT COUNT(*) FROM cdc.fn_cdc_get_net_changes_dbo_""",
            table_name,
            """(@from_lsn, @to_lsn, 'all');""",
        ])
        cursor.execute(count_sql)
        count_row = cursor.fetchone()
        return list(count_row)[0]
def mssql_change_test(self):
    '''
    Create a change in the database so that the CDC table of DEAA0 gets
    populated (test helper).

    Picks the first DEAA0 row whose City starts with 'A', prints it, and,
    when such a row exists, sets City to 'Las Gatos' on the row whose ID
    equals that row's first column. Returns nothing.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        cursor.execute(
            """SELECT TOP 10 * FROM DEAA0 WHERE City LIKE 'A%' ORDER BY DEANumber"""
        )
        first_match = cursor.fetchone()
        print(first_match)
        if not first_match:
            # Nothing to modify.
            return
        update_sql = "".join([
            """UPDATE DEAA0 SET City = 'Las Gatos' WHERE ID = """,
            str(first_match[0]),
            ';',
        ])
        cursor.execute(update_sql)
def nice_get_changes(self, table_name):
    '''
    Stream the net CDC changes of `table_name` as pipe-separated CSV.

    This is a generator: it pulls at most 30,000 rows into memory at a
    time and yields one io.BytesIO per batch, holding the UTF-8 encoded
    CSV text that DataFrame(rows).to_csv(sep="|") produced. The connection
    stays open until the generator is exhausted or closed. (The original
    notes say the data is streamed with the 'smart_open' library; that
    happens in the consumer, not here.)

    The changes are read from cdc.fn_cdc_get_net_changes_dbo_<table_name>
    between the table's minimum LSN and self.end_time_lsn.

    Each fetched row is modified in place before serialisation:
    -unicode values are encoded to UTF-8 byte strings;
    -the first column is blanked (presumably the binary LSN column, which
     is of no use in the CSV -- TODO confirm).
    Rows must therefore be mutable sequences.

    NOTE: Python 2 code (`unicode`, str.decode).
    '''
    from_lsn = self.get_min_lsn(table_name)
    to_lsn = self.end_time_lsn
    with open_sql_db_connection(self.con_params) as cursor:
        # NOTE: the LSNs and the table name are spliced into the SQL text.
        command = """ DECLARE @from_lsn binary(10), @to_lsn binary(10); SET @from_lsn =""" + str(from_lsn) + """; SET @to_lsn =""" + str(to_lsn) + """; SELECT * FROM cdc.fn_cdc_get_net_changes_dbo_""" + table_name + """(@from_lsn, @to_lsn, 'all');"""
        # To load ALL data of a table instead of its net changes, override
        # the command with:
        #   command = """ SELECT * FROM """+table_name+""";"""
        cursor.execute(command)
        while True:
            rows = cursor.fetchmany(30000)
            if not rows:
                break
            # The debug prints of every column name and cell value were
            # removed; they flooded stdout for each 30,000-row batch.
            for row in rows:
                for i in range(len(row)):
                    if isinstance(row[i], unicode):
                        row[i] = row[i].encode('utf-8')
                row[0] = None
            temp = DataFrame(rows).to_csv(sep="|")
            # Encode explicitly as UTF-8. The previous round trip through
            # io.StringIO ended in a bare .encode(), which uses ASCII on
            # Python 2 and raised UnicodeEncodeError on non-ASCII data.
            payload = temp.decode('UTF-8').encode('UTF-8')
            yield io.BytesIO(payload)
def get_column_names(self, tablename):
    '''
    Return the INFORMATION_SCHEMA.COLUMNS rows describing `tablename`.

    Despite the name this does not return bare column names: the result
    is every fetched row (one per column of the table, each carrying that
    column's metadata), so callers have to parse it to get the names.
    'get_column_data' would be a better name.
    '''
    # NOTE: `tablename` is spliced into the SQL text; pass trusted names.
    metadata_sql = "".join([
        """select * from INFORMATION_SCHEMA.COLUMNS where TABLE_NAME='""",
        tablename,
        """';""",
    ])
    with open_sql_db_connection(self.con_params) as cursor:
        cursor.execute(metadata_sql)
        return cursor.fetchall()
def get_net_changes_sql2(self, table_name):
    '''
    Return all the net changes for a given table.

    The changes are read from cdc.fn_cdc_get_net_changes_dbo_<table_name>
    between the table's minimum LSN (self.get_min_lsn) and
    self.end_time_lsn.

    Returns a (rows, cols) tuple: every fetched row, and the column names
    taken from cursor.description.
    '''
    from_lsn = self.get_min_lsn(table_name)
    to_lsn = self.end_time_lsn
    with open_sql_db_connection(self.con_params) as cursor:
        # NOTE: the LSNs and the table name are spliced into the SQL text.
        command = """ DECLARE @from_lsn binary(10), @to_lsn binary(10); SET @from_lsn =""" + str(from_lsn) + """; SET @to_lsn =""" + str(to_lsn) + """; SELECT * FROM cdc.fn_cdc_get_net_changes_dbo_""" + table_name + """(@from_lsn, @to_lsn, 'all');"""
        cursor.execute(command)
        rows = cursor.fetchall()
        cols = [str(col[0]) for col in cursor.description]
        # The whole result used to be serialised to CSV here into a
        # variable that was never read; that wasted work was dropped.
        return rows, cols
def sam(self):
    '''
    Scratch/debug helper.

    Fetches up to ten DEAA0 rows whose City starts with 'A', prints each
    row as a list and then prints the pipe-separated CSV rendering of the
    whole result. Returns nothing.
    '''
    query = """SELECT TOP 10 * FROM DEAA0 WHERE City LIKE 'A%' ORDER BY DEANumber"""
    with open_sql_db_connection(self.con_params) as cursor:
        cursor.execute(query)
        fetched = cursor.fetchall()
        # Built before anything is printed, as before.
        as_csv = DataFrame(fetched).to_csv(sep="|")
        for record in fetched:
            print(list(record))
        print(as_csv)
def get_net_changes_sql(self, table_name):
    '''
    Return all the net changes for a given table.

    Unlike get_net_changes_sql2, the LSN window is computed on the server:
    from the LSN mapped to time 0 ('smallest greater than or equal') up to
    the LSN mapped to GETDATE() ('largest less than or equal').

    Returns a (rows, cols) tuple: every fetched row, and the column names
    taken from cursor.description.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        # NOTE: `table_name` is spliced into the SQL text; pass trusted
        # names only.
        command = """ DECLARE @begin_time datetime, @end_time datetime, @from_lsn binary(10), @to_lsn binary(10); SET @begin_time = 0; SET @end_time = GETDATE(); SET @from_lsn = sys.fn_cdc_map_time_to_lsn('smallest greater than or equal',@begin_time); SET @to_lsn = sys.fn_cdc_map_time_to_lsn('largest less than or equal', @end_time); SELECT * FROM cdc.fn_cdc_get_net_changes_dbo_""" + table_name + """(@from_lsn, @to_lsn, 'all');"""
        cursor.execute(command)
        rows = cursor.fetchall()
        cols = [str(col[0]) for col in cursor.description]
        # The whole result used to be serialised to CSV here into a
        # variable that was never read; that wasted work was dropped.
        return rows, cols
def enable_CDC_on_tables(self):
    '''
    Enable Change_Data_Capture on every user created table in the db that
    is not tracked yet.

    The user tables come from self.get_table_names() and the already
    tracked ones from self.get_tables_tracked_by_CDC(). For each untracked
    table sys.sp_cdc_enable_table is executed with source schema 'dbo';
    tracked tables are only reported. Database-level CDC is NOT enabled
    here.

    Always returns True; a failure only surfaces as an exception raised by
    execute().

    NOTE(review): @role_name is sent as the string N'NULL', not as SQL
    NULL -- confirm that this is intended.
    '''
    with open_sql_db_connection(self.con_params) as cursor:
        user_tables = self.get_table_names()
        tracked_tables = self.get_tables_tracked_by_CDC()
        for name in user_tables:
            if name in tracked_tables:
                print("%s %s" % (name, " IS BEING TRACKED BY CDC"))
                continue
            print("%s %s" % ("Table found not being tracked: ", name))
            enable_sql = "".join([
                """exec sys.sp_cdc_enable_table @source_schema = N'dbo' , @source_name = N'""",
                name,
                """' , @role_name = N'NULL';""",
            ])
            cursor.execute(enable_sql)
    return True