def do_sync(conn_config,
            catalog,
            default_replication_method,
            state,
            state_file=None):
    """
    Orchestrates sync of all streams
    """
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog['streams']))
    streams.sort(key=lambda s: s['tap_stream_id'])
    LOGGER.info("Selected streams: %s ", [s['tap_stream_id'] for s in streams])
    if any_logical_streams(streams, default_replication_method):
        # Logical replication requires fetching the current LSN to use as an upper bound for this run
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.debug("end_lsn = %s ", end_lsn)
    else:
        end_lsn = None

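    # Re-run discovery so each selected stream's schema and metadata are up to date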
    refresh_streams_schema(conn_config, streams)

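    # Decide each stream's sync method up front and split the streams into
    # traditional (full-table / incremental) and logical-replication groups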
    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.debug("Found currently_syncing: %s", currently_syncing)

        currently_syncing_stream = list(
            filter(lambda s: s['tap_stream_id'] == currently_syncing,
                   traditional_streams))

        if not currently_syncing_stream:
            LOGGER.warning(
                "Unable to locate currently_syncing (%s) among selected traditional streams (%s); "
                "will ignore", currently_syncing,
                {s['tap_stream_id']
                 for s in traditional_streams})

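        # Move the in-flight stream to the front so the sync resumes where it stopped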
        other_streams = list(
            filter(lambda s: s['tap_stream_id'] != currently_syncing,
                   traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No streams marked as currently_syncing in state file")

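    # Sync traditional streams one at a time, threading the updated state through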
    for stream in traditional_streams:
        state = sync_traditional_stream(
            conn_config, stream, state,
            sync_method_lookup[stream['tap_stream_id']], end_lsn)

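    # Logical streams are synced per database: group them by database name and
    # point the connection at each database in turn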
    logical_streams.sort(key=lambda s: metadata.to_map(s['metadata']).get(
        ()).get('database-name'))
    for dbname, streams in itertools.groupby(
            logical_streams, lambda s: metadata.to_map(s['metadata']).get(
                ()).get('database-name')):
        conn_config['dbname'] = dbname
        state = sync_logical_streams(conn_config, list(streams), state,
                                     end_lsn, state_file)
    return state
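

# A minimal sketch, not taken from this tap's source, of how do_sync might be
# wired into a Singer tap entry point; the required-config keys and the
# 'FULL_TABLE' default below are assumptions for illustration.
def example_main():
    args = singer.utils.parse_args(['host', 'port', 'user', 'password', 'dbname'])
    state = do_sync(args.config,
                    args.properties,  # catalog dict supplied via --properties
                    'FULL_TABLE',     # assumed default replication method
                    args.state)
    singer.write_state(state)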


def consume_message(streams, state, msg, time_extracted, conn_info):
    # wal2json's write-in-chunks mode prepends a comma to continuation chunks;
    # strip it, then skip fragments that are not valid JSON on their own
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except json.JSONDecodeError:
        return state

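    # data_start is the WAL position (LSN) at which this message begins; it is
    # recorded as the stream bookmark below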
    lsn = msg.data_start

    streams_lookup = {s['tap_stream_id']: s for s in streams}

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'],
                                                  payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]

    if payload['kind'] not in {'insert', 'update', 'delete'}:
        raise UnsupportedPayloadKindError(
            f"unrecognized replication operation: {payload['kind']}")

    # Collect fields present in the payload but missing from the schema properties;
    # only inserts and updates carry the column list needed to detect such a difference
    diff = set()
    if payload['kind'] in {'insert', 'update'}:
        diff = set(payload['columnnames']).difference(
            target_stream['schema']['properties'].keys())

    # If the payload contains new columns that are not yet in the schema properties, refresh the stream schema
    if diff:
        LOGGER.info(
            'Detected new columns "%s", refreshing schema of stream %s', diff,
            target_stream['stream'])
        # encountered a column that is not in the schema
        # refresh the stream schema and metadata by running discovery
        refresh_streams_schema(conn_info, [target_stream])

        # add the automatic properties back to the stream
        add_automatic_properties(target_stream,
                                 conn_info.get('debug_lsn', False))

        # publish new schema
        sync_common.send_schema_message(target_stream, ['lsn'])

    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = {
        c
        for c in target_stream['schema']['properties'].keys()
        if sync_common.should_sync_column(stream_md_map, c)
    }

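    # Inserts and updates carry parallel columnnames/columnvalues arrays; keep
    # only the columns selected in the catalog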
    if payload['kind'] in {'insert', 'update'}:
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['columnnames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

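        # _sdc_deleted_at is always emitted: null here, a timestamp for deletes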
        col_names.append('_sdc_deleted_at')
        col_vals.append(None)

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    elif payload['kind'] == 'delete':
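        # Delete payloads expose only the old key columns via 'oldkeys'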
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(singer.utils.strftime(time_extracted))

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    singer.write_message(record_message)
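    # Advance the per-stream lsn bookmark so a restarted sync resumes from this message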
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn',
                                  lsn)

    return state
    def test_refresh_streams_schema(self):
        conn_config = get_test_connection_config()

        streams = [{
            'table_name': self.table_name,
            'stream': self.table_name,
            'tap_stream_id': f'public-{self.table_name}',
            'schema': [],
            'metadata': [{
                'breadcrumb': [],
                'metadata': {
                    'replication-method': 'LOG_BASED',
                    'table-key-properties': ['some_id'],
                    'row-count': 1000,
                }
            }]
        }]

        stream_utils.refresh_streams_schema(conn_config, streams)

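        # refresh_streams_schema mutates the passed-in streams, so assert
        # against the refreshed entries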
        self.assertEqual(len(streams), 1)
        self.assertEqual(self.table_name, streams[0].get('table_name'))
        self.assertEqual(self.table_name, streams[0].get('stream'))

        streams[0]['metadata'].sort(key=lambda md: md['breadcrumb'])

        self.assertEqual(
            metadata.to_map(streams[0]['metadata']), {
                (): {
                    'table-key-properties': ['id'],
                    'database-name': 'postgres',
                    'schema-name': 'public',
                    'is-view': False,
                    'row-count': 0,
                    'replication-method': 'LOG_BASED'
                },
                ('properties', 'character-varying_name'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'id'): {
                    'inclusion': 'automatic',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                },
                ('properties', 'varchar-name'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'text-name'): {
                    'inclusion': 'available',
                    'sql-datatype': 'text',
                    'selected-by-default': True
                },
                ('properties', 'char_name'): {
                    'selected-by-default': True,
                    'inclusion': 'available',
                    'sql-datatype': 'character'
                }
            })

        self.assertEqual(
            {
                'properties': {
                    'id': {
                        'type': ['integer'],
                        'maximum': 2147483647,
                        'minimum': -2147483648
                    },
                    'character-varying_name': {
                        'type': ['null', 'string']
                    },
                    'varchar-name': {
                        'type': ['null', 'string'],
                        'maxLength': 28
                    },
                    'char_name': {
                        'type': ['null', 'string'],
                        'maxLength': 10
                    },
                    'text-name': {
                        'type': ['null', 'string']
                    }
                },
                'type': 'object',
                'definitions': BASE_RECURSIVE_SCHEMAS
            }, streams[0].get('schema'))