def fetch_current_scn(conn_config):
    connection = orc_db.open_connection(conn_config)
    cur = connection.cursor()
    current_scn = cur.execute("SELECT current_scn FROM V$DATABASE").fetchall()[0][0]
    cur.close()
    connection.close()
    return current_scn
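# Illustrative sketch only (not part of the original module): fetch_current_scn is
# typically paired with sync_tables, using the fetched SCN as the upper bound of a
# LogMiner pass. The helper name below is hypothetical.
def _example_logminer_run(conn_config, streams, state):
    # Capture the database's current SCN first so the mining pass has a fixed
    # upper bound even while new transactions keep committing.
    end_scn = fetch_current_scn(conn_config)
    # Mine redo records for all selected streams up to that SCN.
    return sync_tables(conn_config, streams, state, end_scn)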
def sync_tables(conn_config, streams, state, end_scn, scn_window_size=None):
    connection = orc_db.open_connection(conn_config)
    if not verify_db_supplemental_log_level(connection):
        for stream in streams:
            if not verify_table_supplemental_log_level(stream, connection):
                raise Exception("""
                Unable to replicate with LogMiner for stream({}) because supplemental_log_data is not set to 'ALL' for either the table or the database.
                Please run: ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
                """.format(stream.tap_stream_id))

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")

    start_scn_window = min([get_bookmark(state, s.tap_stream_id, 'scn') for s in streams])

    while start_scn_window < end_scn:
        stop_scn_window = end_scn
        if SCN_WINDOW_SIZE:
            stop_scn_window = start_scn_window + SCN_WINDOW_SIZE
            if stop_scn_window > end_scn:
                stop_scn_window = end_scn

        state = sync_tables_logminer(cur, streams, state, start_scn_window, stop_scn_window)

        start_scn_window = stop_scn_window

    cur.close()
    connection.close()
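# Illustrative sketch only: the while-loop above steps through the SCN range in
# fixed-size windows. The standalone helper below reproduces that chunking logic so
# it can be inspected or unit-tested in isolation; the function name is hypothetical
# and not part of the tap.
def _example_scn_windows(start_scn, end_scn, window_size=None):
    windows = []
    start = start_scn
    while start < end_scn:
        # Without a window size, mine the whole remaining range in one pass;
        # otherwise cap each window at window_size SCNs (clamped to end_scn).
        stop = end_scn
        if window_size:
            stop = min(start + window_size, end_scn)
        windows.append((start, stop))
        start = stop
    return windows

# _example_scn_windows(100, 350, 100) -> [(100, 200), (200, 300), (300, 350)]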
def sync_view(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler
    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

    # pick a new table version
    nascent_stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream.tap_stream_id, 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.tap_stream_id,
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        select_sql = 'SELECT {} FROM {}.{}'.format(','.join(escaped_columns), escaped_schema, escaped_table)
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(stream, row, nascent_stream_version,
                                                          desired_columns, time_extracted)
            singer.write_message(record_message)
            counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    cur.close()
    connection.close()
    return state
def do_discovery(conn_config, filter_schemas):
    LOGGER.info("starting discovery")

    connection = orc_db.open_connection(conn_config)
    cur = connection.cursor()

    row_counts = produce_row_counts(connection, filter_schemas)
    table_info = {}

    binds_sql = [":{}".format(b) for b in range(len(filter_schemas))]

    sql = filter_schemas_sql_clause("""
    SELECT owner, table_name
    FROM all_tables
    WHERE owner != 'SYS'""", binds_sql)

    LOGGER.info("fetching tables: %s %s", sql, filter_schemas)
    for row in cur.execute(sql, filter_schemas):
        schema = row[0]
        table = row[1]

        if schema not in table_info:
            table_info[schema] = {}

        is_view = False
        table_info[schema][table] = {
            'row_count': row_counts[table],
            'is_view': is_view
        }

    sql = filter_schemas_sql_clause("""
    SELECT owner, view_name
    FROM sys.all_views
    WHERE owner != 'SYS'""", binds_sql)

    LOGGER.info("fetching views")
    for row in cur.execute(sql, filter_schemas):
        view_name = row[1]
        schema = row[0]

        if schema not in table_info:
            table_info[schema] = {}

        table_info[schema][view_name] = {'is_view': True}

    # If table_info is empty, send a SYMON error that no table was found with the current input.
    if not table_info:
        cur.close()
        connection.close()
        # Raise a database error because only database error messages are interpreted during discovery
        raise cx_Oracle.DatabaseError('SYM-00001: No Table Found in the Database')

    catalog = discover_columns(connection, table_info, filter_schemas)
    json.dump(catalog, sys.stdout, indent=2)

    cur.close()
    connection.close()
    return catalog
def do_discovery(conn_config, filter_schemas):
    LOGGER.info("starting discovery")

    connection = orc_db.open_connection(conn_config)
    cur = connection.cursor()

    row_counts = produce_row_counts(connection, filter_schemas)
    table_info = {}

    binds_sql = [":{}".format(b) for b in range(len(filter_schemas))]

    sql = filter_schemas_sql_clause("""
    SELECT owner, table_name
    FROM all_tables
    WHERE owner != 'SYS'""", binds_sql)

    LOGGER.info("fetching tables: %s %s", sql, filter_schemas)
    for row in cur.execute(sql, filter_schemas):
        schema = row[0]
        table = row[1]

        if schema not in table_info:
            table_info[schema] = {}

        is_view = False
        table_info[schema][table] = {
            'row_count': row_counts[table],
            'is_view': is_view
        }

    sql = filter_schemas_sql_clause("""
    SELECT owner, view_name
    FROM sys.all_views
    WHERE owner != 'SYS'""", binds_sql)

    LOGGER.info("fetching views")
    for row in cur.execute(sql, filter_schemas):
        view_name = row[1]
        schema = row[0]

        if schema not in table_info:
            table_info[schema] = {}

        table_info[schema][view_name] = {'is_view': True}

    catalog = discover_columns(connection, table_info, filter_schemas)
    dump_catalog(catalog)

    cur.close()
    connection.close()
    return catalog
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler
    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')
    # If there was no bookmark for stream_version, it is the first time
    # this table is being sync'd, so get a new version and write it to state
    if stream_version is None:
        stream_version = int(time.time() * 1000)
        state = singer.write_bookmark(state, stream.tap_stream_id, 'version', stream_version)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=stream_version)
    singer.write_message(activate_version_message)

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    replication_key = md.get((), {}).get('replication-key')
    #escaped_replication_key = common.prepare_columns_sql(stream, replication_key)
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id, 'replication_key_value')
    replication_key_sql_datatype = md.get(('properties', replication_key)).get('sql-datatype')

    with metrics.record_counter(None) as counter:
        if replication_key_value:
            LOGGER.info("Resuming Incremental replication from %s = %s", replication_key, replication_key_value)
            casted_where_clause_arg = common.prepare_where_clause_arg(replication_key_value,
                                                                      replication_key_sql_datatype)

            select_sql = """SELECT {}
                              FROM {}.{}
                             WHERE {} >= {}
                             ORDER BY {} ASC
                         """.format(','.join(escaped_columns), escaped_schema, escaped_table,
                                    replication_key, casted_where_clause_arg, replication_key)
        else:
            select_sql = """SELECT {}
                              FROM {}.{}
                             ORDER BY {} ASC
                         """.format(','.join(escaped_columns), escaped_schema, escaped_table, replication_key)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(stream, row, stream_version,
                                                          desired_columns, time_extracted)
            singer.write_message(record_message)
            rows_saved = rows_saved + 1

            # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great.
            # Even worse would be allowing the NULL value to enter into the state.
            if record_message.record[replication_key] is not None:
                state = singer.write_bookmark(state,
                                              stream.tap_stream_id,
                                              'replication_key_value',
                                              record_message.record[replication_key])

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
            counter.increment()

    cur.close()
    connection.close()
    return state
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler
    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

    # pick a new table version IFF we do not have an ORA_ROWSCN in our state;
    # the presence of an ORA_ROWSCN indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        ora_rowscn = singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN')
        if ora_rowscn:
            LOGGER.info("Resuming Full Table replication %s from ORA_ROWSCN %s", nascent_stream_version, ora_rowscn)
            select_sql = """SELECT {}, ORA_ROWSCN
                              FROM {}.{}
                             WHERE ORA_ROWSCN >= {}
                             ORDER BY ORA_ROWSCN ASC
                         """.format(','.join(escaped_columns), escaped_schema, escaped_table, ora_rowscn)
        else:
            select_sql = """SELECT {}, ORA_ROWSCN
                              FROM {}.{}
                             ORDER BY ORA_ROWSCN ASC""".format(','.join(escaped_columns), escaped_schema, escaped_table)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            ora_rowscn = row[-1]
            row = row[:-1]
            record_message = common.row_to_singer_message(stream, row, nascent_stream_version,
                                                          desired_columns, time_extracted)
            singer.write_message(record_message)
            state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', ora_rowscn)
            rows_saved = rows_saved + 1
            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
            counter.increment()

    state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    cur.close()
    connection.close()
    return state
def sync_tables(conn_config, streams, state, end_scn):
    connection = orc_db.open_connection(conn_config)
    if not verify_db_supplemental_log_level(connection):
        for stream in streams:
            if not verify_table_supplemental_log_level(stream, connection):
                raise Exception("""
                Unable to replicate with LogMiner for stream({}) because supplemental_log_data is not set to 'ALL' for either the table or the database.
                Please run: ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
                """.format(stream.tap_stream_id))

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")

    start_scn = min([get_bookmark(state, s.tap_stream_id, 'scn') for s in streams])
    time_extracted = utils.now()

    start_logmnr_sql = """BEGIN
                            DBMS_LOGMNR.START_LOGMNR(
                                    startScn => {},
                                    endScn => {},
                                    OPTIONS => DBMS_LOGMNR.DICT_FROM_ONLINE_CATALOG +
                                               DBMS_LOGMNR.COMMITTED_DATA_ONLY +
                                               DBMS_LOGMNR.CONTINUOUS_MINE);
                          END;""".format(start_scn, end_scn)

    LOGGER.info("Starting LogMiner for %s: %s -> %s",
                list(map(lambda s: s.tap_stream_id, streams)), start_scn, end_scn)
    LOGGER.info("%s", start_logmnr_sql)
    cur.execute(start_logmnr_sql)

    # mine changes
    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        desired_columns = [c for c in stream.schema.properties.keys() if common.should_sync_column(md_map, c)]
        redo_value_sql_clause = ",\n ".join(["""DBMS_LOGMNR.MINE_VALUE(REDO_VALUE, :{})""".format(idx + 1)
                                             for idx, c in enumerate(desired_columns)])
        undo_value_sql_clause = ",\n ".join(["""DBMS_LOGMNR.MINE_VALUE(UNDO_VALUE, :{})""".format(idx + 1)
                                             for idx, c in enumerate(desired_columns)])

        schema_name = md_map.get(()).get('schema-name')
        stream_version = get_stream_version(stream.tap_stream_id, state)
        mine_sql = """
        SELECT OPERATION, SQL_REDO, SCN, CSCN, COMMIT_TIMESTAMP, {}, {}
          FROM v$logmnr_contents
         WHERE table_name = :table_name
           AND operation in ('INSERT', 'UPDATE', 'DELETE')
        """.format(redo_value_sql_clause, undo_value_sql_clause)
        binds = [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
                [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
                [stream.table]

        rows_saved = 0
        columns_for_record = desired_columns + ['scn', '_sdc_deleted_at']
        with metrics.record_counter(None) as counter:
            LOGGER.info("Examining log for table %s", stream.tap_stream_id)
            common.send_schema_message(stream, ['lsn'])
            for op, redo, scn, cscn, commit_ts, *col_vals in cur.execute(mine_sql, binds):
                redo_vals = col_vals[0:len(desired_columns)]
                undo_vals = col_vals[len(desired_columns):]
                if op == 'INSERT' or op == 'UPDATE':
                    redo_vals += [cscn, None]
                    record_message = row_to_singer_message(stream, redo_vals, stream_version,
                                                           columns_for_record, time_extracted)
                elif op == 'DELETE':
                    undo_vals += [cscn, singer.utils.strftime(commit_ts.replace(tzinfo=pytz.UTC))]
                    record_message = row_to_singer_message(stream, undo_vals, stream_version,
                                                           columns_for_record, time_extracted)
                else:
                    raise Exception("unrecognized logminer operation: {}".format(op))

                singer.write_message(record_message)
                rows_saved = rows_saved + 1
                counter.increment()
                state = singer.write_bookmark(state, stream.tap_stream_id, 'scn', int(cscn))

                if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    for s in streams:
        LOGGER.info("updating bookmark for stream %s to end_scn %s", s.tap_stream_id, end_scn)
        state = singer.write_bookmark(state, s.tap_stream_id, 'scn', end_scn)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    cur.close()
    connection.close()
    return state