def get_max_pk_values(cursor, catalog_entry):
    """Get actual max primary key values from database"""
    database_name = common.get_database_name(catalog_entry)
    escaped_db = common.escape(database_name)
    escaped_table = common.escape(catalog_entry.table)

    key_properties = common.get_key_properties(catalog_entry)
    escaped_columns = [common.escape(c) for c in key_properties]

    sql = """SELECT {}
               FROM {}.{}
              ORDER BY {}
              LIMIT 1
          """

    select_column_clause = ', '.join(escaped_columns)
    order_column_clause = ', '.join([pk + ' DESC' for pk in escaped_columns])

    cursor.execute(sql.format(select_column_clause,
                              escaped_db,
                              escaped_table,
                              order_column_clause))

    result = cursor.fetchone()

    if result:
        max_pk_values = dict(zip(key_properties, result))
    else:
        max_pk_values = {}

    return max_pk_values
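
# Usage sketch (illustrative only; the connection/cursor names are hypothetical,
# but any DB-API cursor exposing execute()/fetchone() will do):
#
#     with open_conn.cursor() as cur:
#         max_pk = get_max_pk_values(cur, catalog_entry)
#
# With key_properties ['ID', 'VERSION'] the generated SQL is roughly
#     SELECT <ID>, <VERSION> FROM <db>.<table> ORDER BY <ID> DESC, <VERSION> DESC LIMIT 1
# and the result looks like {'ID': 1042, 'VERSION': 7}. Note that
# ORDER BY ... DESC LIMIT 1 returns the lexicographically greatest key *tuple*,
# not the per-column maximum of each key column.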
def sync_streams(snowflake_conn, catalog, state):
    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method', 'FULL_TABLE')

        database_name = common.get_database_name(catalog_entry)
        schema_name = common.get_schema_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            LOGGER.info('Beginning to sync %s.%s.%s',
                        database_name, schema_name, catalog_entry.table)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(snowflake_conn, catalog_entry, state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(snowflake_conn, catalog_entry, state, columns)
            else:
                raise Exception('Only INCREMENTAL and FULL_TABLE replication '
                                'methods are supported')

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
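
# Wiring sketch (hypothetical; parse_args comes from singer-python, while
# SnowflakeConnection, discover_catalog and REQUIRED_CONFIG_KEYS are assumptions
# about the surrounding module):
#
#     args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS)
#     snowflake_conn = SnowflakeConnection(args.config)        # assumed helper
#     catalog = resolve_catalog(discover_catalog(snowflake_conn),
#                               args.catalog.streams)          # see below
#     sync_streams(snowflake_conn, catalog, args.state or {})
#
# Because a state message is emitted before each stream and currently_syncing is
# cleared after the loop, a downstream target can persist progress and the tap
# can resume mid-catalog after an interruption.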
def resolve_catalog(discovered_catalog, streams_to_sync):
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams_to_sync:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        discovered_table = discovered_catalog.get_stream(catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning('Database %s table %s was selected but does not exist',
                           database_name, catalog_entry.table)
            continue

        selected = {k for k, v in catalog_entry.schema.properties.items()
                    if common.property_is_selected(catalog_entry, k) or k == replication_key}

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        result.streams.append(
            CatalogEntry(tap_stream_id=catalog_entry.tap_stream_id,
                         metadata=catalog_entry.metadata,
                         stream=catalog_entry.tap_stream_id,
                         table=catalog_entry.table,
                         schema=Schema(
                             type='object',
                             properties={col: discovered_table.schema.properties[col]
                                         for col in columns})))

    return result
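
# Behavior sketch for the desired_columns() helper used above (a hypothetical
# illustration assuming the conventional Singer pattern, not this module's
# definition): it intersects the selected columns with those the discovered
# schema actually supports, and force-includes columns whose inclusion metadata
# is 'automatic', such as primary keys. For example:
#
#     selected           = {'NAME'}
#     discovered columns = {'ID': automatic, 'NAME': available, 'RAW': unsupported}
#     desired_columns(selected, discovered_table.schema)  ->  {'ID', 'NAME'}
#
# Combined with the replication key being force-added to `selected` above, this
# keeps the resolved schema usable for bookmarking even when the user has
# deselected the key columns.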