# Module-level imports assumed by the snippets below; LOGGER,
# connect_with_backoff, RESUMABLE_PK_TYPES, binlog, common and full_table
# come from the surrounding tap-mysql-style package and are not shown here.
import copy
import datetime

import singer


def get_max_pk_values(cursor, catalog_entry):
    database_name = common.get_database_name(catalog_entry)
    escaped_db = common.escape(database_name)
    escaped_table = common.escape(catalog_entry.table)

    key_properties = common.get_key_properties(catalog_entry)
    escaped_columns = [common.escape(c) for c in key_properties]

    sql = """SELECT {}
               FROM {}.{}
    """

    select_column_clause = ", ".join(
        ["max(" + pk + ")" for pk in escaped_columns])

    cursor.execute(sql.format(select_column_clause,
                              escaped_db,
                              escaped_table))
    result = cursor.fetchone()

    processed_results = []
    for bm in result:
        if isinstance(bm, (datetime.date, datetime.datetime, datetime.timedelta)):
            # Normalize temporal values so bookmarks serialize consistently
            processed_results += [common.to_utc_datetime_str(bm)]
        elif bm is not None:
            processed_results += [bm]

    max_pk_values = {}
    if processed_results:
        max_pk_values = dict(zip(key_properties, processed_results))

    return max_pk_values
def get_max_pk_values(cursor, catalog_entry):
    database_name = common.get_database_name(catalog_entry)
    escaped_db = common.escape(database_name)
    escaped_table = common.escape(catalog_entry.table)

    key_properties = common.get_key_properties(catalog_entry)
    escaped_columns = [common.escape(c) for c in key_properties]

    sql = """SELECT {}
               FROM {}.{}
              ORDER BY {}
              LIMIT 1
    """

    select_column_clause = ", ".join(escaped_columns)
    order_column_clause = ", ".join([pk + " DESC" for pk in escaped_columns])

    cursor.execute(sql.format(select_column_clause,
                              escaped_db,
                              escaped_table,
                              order_column_clause))
    result = cursor.fetchone()

    if result:
        max_pk_values = dict(zip(key_properties, result))
    else:
        max_pk_values = {}

    return max_pk_values
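# For a hypothetical composite key (`id`, `created_at`) on `shop`.`orders`
# (names here are illustrative only), the two variants above generate:
#
#   MAX() variant:     SELECT max(`id`), max(`created_at`) FROM `shop`.`orders`
#   ORDER BY variant:  SELECT `id`, `created_at` FROM `shop`.`orders`
#                      ORDER BY `id` DESC, `created_at` DESC LIMIT 1
#
# The difference matters for composite keys: per-column MAX() can yield a
# (max_id, max_created_at) pair that matches no actual row, while the
# ORDER BY form returns the key of the greatest row in PK sort order.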
def generate_pk_clause(catalog_entry, state):
    key_properties = common.get_key_properties(catalog_entry)
    escaped_columns = [common.escape(c) for c in key_properties]

    max_pk_values = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'max_pk_values')

    last_pk_fetched = singer.get_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'last_pk_fetched')

    # NB: bookmark values are interpolated unquoted, so this version only
    # produces valid SQL for numeric primary keys.
    if last_pk_fetched:
        pk_comparisons = ["({} > {} AND {} <= {})".format(common.escape(pk),
                                                          last_pk_fetched[pk],
                                                          common.escape(pk),
                                                          max_pk_values[pk])
                          for pk in key_properties]
    else:
        pk_comparisons = ["{} <= {}".format(common.escape(pk), max_pk_values[pk])
                          for pk in key_properties]

    sql = " WHERE {} ORDER BY {} ASC".format(" AND ".join(pk_comparisons),
                                             ", ".join(escaped_columns))

    return sql
def pks_are_auto_incrementing(mysql_conn, catalog_entry):
    database_name = common.get_database_name(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if not key_properties:
        return False

    sql = """SELECT 1
               FROM information_schema.columns
              WHERE table_schema = '{}'
                AND table_name = '{}'
                AND column_name = '{}'
                AND extra LIKE '%auto_increment%'
    """

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            for pk in key_properties:
                cur.execute(sql.format(database_name,
                                       catalog_entry.table,
                                       pk))

                result = cur.fetchone()

                if not result:
                    return False

    return True
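# The information_schema lookups in this module interpolate names directly
# into the SQL text. A minimal sketch of the same auto_increment check with
# parameterized values instead (assumes a PyMySQL-style cursor, where
# execute() takes a params tuple, uses %s placeholders, and literal percent
# signs must be doubled); the helper name is hypothetical:
def pk_is_auto_incrementing(cursor, database_name, table_name, pk):
    cursor.execute(
        """SELECT 1
             FROM information_schema.columns
            WHERE table_schema = %s
              AND table_name = %s
              AND column_name = %s
              AND extra LIKE '%%auto_increment%%'""",
        (database_name, table_name, pk))
    return cursor.fetchone() is not None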
def pks_are_integer_or_varchar(mysql_conn, config, catalog_entry):
    database_name = common.get_database_name(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if config.get('allow_non_auto_increment_pks') == 'true' and key_properties:
        valid_column_types = set([
            'tinyint',
            'smallint',
            'mediumint',
            'int',
            'bigint',
            'varchar',
            'char'
        ])

        sql = """SELECT data_type
                   FROM information_schema.columns
                  WHERE table_schema = '{}'
                    AND table_name = '{}'
                    AND column_name = '{}'
        """

        with connect_with_backoff(mysql_conn) as open_conn:
            with open_conn.cursor() as cur:
                for pk in key_properties:
                    cur.execute(sql.format(database_name,
                                           catalog_entry.table,
                                           pk))

                    result = cur.fetchone()

                    if not result:
                        raise Exception(
                            "Primary key column {} does not exist.".format(pk))

                    if result[0] not in valid_column_types:
                        return False

        return True

    return False
def write_schema_message(catalog_entry, bookmark_properties=[]):
    key_properties = common.get_key_properties(catalog_entry)

    singer.write_message(singer.SchemaMessage(
        stream=catalog_entry.stream,
        schema=catalog_entry.schema.to_dict(),
        key_properties=key_properties,
        bookmark_properties=bookmark_properties))
def generate_pk_clause(catalog_entry, state):
    key_properties = common.get_key_properties(catalog_entry)
    escaped_columns = [common.escape(c) for c in key_properties]

    max_pk_values = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'max_pk_values')

    last_pk_fetched = singer.get_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'last_pk_fetched')

    pk_comparisons = []

    if not max_pk_values:
        return ""

    if last_pk_fetched:
        for pk in key_properties:
            column_type = catalog_entry.schema.properties.get(pk).type

            # quote last/max PK val if column is VARCHAR
            if 'string' in column_type:
                last_pk_val = "'" + last_pk_fetched[pk] + "'"
                max_pk_val = "'" + max_pk_values[pk] + "'"
            else:
                last_pk_val = last_pk_fetched[pk]
                max_pk_val = max_pk_values[pk]

            pk_comparisons.append("({} > {} AND {} <= {})".format(common.escape(pk),
                                                                  last_pk_val,
                                                                  common.escape(pk),
                                                                  max_pk_val))
    else:
        for pk in key_properties:
            column_schema = catalog_entry.schema.properties.get(pk)
            column_type = column_schema.type

            # quote last/max PK val if column is VARCHAR
            if 'string' in column_type:
                pk_val = "'{}'".format(max_pk_values[pk])
            else:
                pk_val = max_pk_values[pk]

            pk_comparisons.append("{} <= {}".format(common.escape(pk), pk_val))

    sql = " WHERE {} ORDER BY {} ASC".format(" AND ".join(pk_comparisons),
                                             ", ".join(escaped_columns))

    return sql
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using full table replication",
                catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "version")

    state = singer.write_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  "initial_full_table_complete",
                                  True)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
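# Illustrative bookmark state after do_sync_full_table completes (the stream
# id "mydb-orders" is hypothetical); the transient 'version' bookmark has
# been cleared in favor of the initial_full_table_complete flag:
#
#   {"bookmarks": {"mydb-orders": {"initial_full_table_complete": true}}}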
def generate_pk_clause(catalog_entry, state):
    key_properties = common.get_key_properties(catalog_entry)

    max_pk_values = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'max_pk_values')

    last_pk_fetched = singer.get_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'last_pk_fetched')

    last_pk_clause = ''
    max_pk_comparisons = []

    if not max_pk_values:
        return ""

    if last_pk_fetched:
        for pk in key_properties:
            column_type = catalog_entry.schema.properties.get(pk).type

            # Add AND to interpolate along with max_pk_values clauses
            last_pk_clause = '({}) AND '.format(
                generate_pk_bookmark_clause(key_properties,
                                            last_pk_fetched,
                                            catalog_entry))
            max_pk_comparisons.append("{} <= {}".format(
                common.escape(pk),
                quote_where_clause_value(max_pk_values[pk], column_type)))
    else:
        for pk in key_properties:
            column_schema = catalog_entry.schema.properties.get(pk)
            column_type = column_schema.type

            pk_val = quote_where_clause_value(max_pk_values[pk], column_type)
            max_pk_comparisons.append("{} <= {}".format(common.escape(pk), pk_val))

    order_by_columns = [common.escape(c) for c in key_properties]
    sql = " WHERE {}{} ORDER BY {} ASC".format(
        last_pk_clause,
        " AND ".join(max_pk_comparisons),
        ", ".join(order_by_columns))

    return sql
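# generate_pk_bookmark_clause and quote_where_clause_value are referenced
# above but not shown in this section. Minimal sketches follow, under the
# assumption that the former builds the lexicographic "resume strictly after
# the last fetched row" condition and the latter quotes string-typed values:
def quote_where_clause_value(value, column_type):
    # JSON-schema types arrive as a list like ['null', 'string']; quote
    # string values so they compare correctly in the generated WHERE clause.
    if 'string' in column_type:
        return "'{}'".format(value)
    return str(value)


def generate_pk_bookmark_clause(key_properties, last_pk_fetched, catalog_entry):
    # Builds (pk1 > v1) OR (pk1 = v1 AND pk2 > v2) OR ... so a resumed sync
    # continues after the last row fetched, in primary key order.
    def quoted(pk):
        column_type = catalog_entry.schema.properties.get(pk).type
        return quote_where_clause_value(last_pk_fetched[pk], column_type)

    comparisons = []
    for i, pk in enumerate(key_properties):
        conditions = ["{} = {}".format(common.escape(prior), quoted(prior))
                      for prior in key_properties[:i]]
        conditions.append("{} > {}".format(common.escape(pk), quoted(pk)))
        comparisons.append("({})".format(" AND ".join(conditions)))

    return " OR ".join(comparisons)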
def sync_is_resumable(mysql_conn, catalog_entry):
    '''
    In order to resume a full table sync, a table requires a primary key
    whose columns are all of a resumable type (see RESUMABLE_PK_TYPES).
    '''
    database_name = common.get_database_name(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if not key_properties:
        return False

    sql = """SELECT data_type
               FROM information_schema.columns
              WHERE table_schema = '{}'
                AND table_name = '{}'
                AND column_name = '{}'
    """

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            for pk in key_properties:
                cur.execute(sql.format(database_name,
                                       catalog_entry.table,
                                       pk))

                result = cur.fetchone()

                if not result:
                    raise Exception(
                        "Primary key column {} does not exist.".format(pk))

                if result[0] not in RESUMABLE_PK_TYPES:
                    LOGGER.warning("Found primary key column %s with type %s. Will not be able "
                                   "to resume interrupted FULL_TABLE sync using this key.",
                                   pk, result[0])
                    return False

    return True
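# RESUMABLE_PK_TYPES is used above but not defined in this section. A
# plausible definition, assuming the same integer, character, and temporal
# MySQL types the surrounding checks rely on:
RESUMABLE_PK_TYPES = set([
    'tinyint', 'smallint', 'mediumint', 'int', 'bigint',  # integer types
    'char', 'varchar',                                    # character types
    'datetime', 'timestamp', 'date', 'time',              # temporal types
])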
def do_sync_historical_binlog(mysql_conn, config, catalog_entry, state, columns):
    binlog.verify_binlog_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(
            "Unable to replicate stream({}) with binlog because it is a view.".format(
                catalog_entry.stream))

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s",
                    catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s",
                    catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'initial_binlog_complete',
                                      False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)
        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        if full_table.sync_is_resumable(mysql_conn, catalog_entry):
            # We must save log_file and log_pos across FULL_TABLE syncs when performing
            # a resumable full table sync
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

            # Non-resumable sync: only record the binlog coordinates once the
            # full table copy has finished.
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)
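# binlog.fetch_current_log_file_and_pos is called above but not shown. A
# minimal sketch, assuming it reads the server's current coordinates via
# SHOW MASTER STATUS (which requires binary logging to be enabled):
def fetch_current_log_file_and_pos(mysql_conn):
    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("SHOW MASTER STATUS")
            result = cur.fetchone()

            if result is None:
                raise Exception("MySQL binary logging is not enabled.")

            # First two columns of SHOW MASTER STATUS are (File, Position)
            return result[0], result[1]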